forked from ssvas1997/Stack-Overflow-Tag-Prediction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Final.py
71 lines (67 loc) · 2.53 KB
/
Final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import time, re, json, numpy as np, sys, csv
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from nltk.stem.snowball import SnowballStemmer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
# --- Module-level setup -----------------------------------------------------
# Everything below runs at import time; predictTags() reads `s`, `stemmer`,
# `classifier` and `multibin` as globals.
#
# Disabled code: it used to open 'Tags.txt' and 'cleaned.txt', read their
# first 248000 lines, and declare some scratch lists. Kept for reference only
# (presumably the data the saved model was built from - TODO confirm).
#fh=open('Tags.txt','rt',encoding="utf-8")
#fh2=open('cleaned.txt','rt',encoding="utf-8")
#_i=[]
#_t=[]
#_T=[]
#_b=[]
# English stop-word set from NLTK; a set so membership tests are O(1).
s=set(stopwords.words('english'))
# Snowball English stemmer. Per the NLTK docs, ignore_stopwords=True makes
# stem() return stop words unchanged instead of stemming them.
stemmer = SnowballStemmer('english', ignore_stopwords=True)
# Not referenced anywhere else in the visible code.
count=0
#tagrows=fh.read().split('\n')[:248000]
#checktags=[]
#X=fh2.read().split('\n')[:248000]
# Saved objects restored with joblib (despite the .txt extension these are not
# text files). joblib.load unpickles its input, so only load files you trust.
# NOTE(review): `sklearn.externals.joblib` (imported above) was removed in
# scikit-learn 0.23 - confirm the pinned version, or switch to `import joblib`.
# `classifier` must expose predict() on a list of strings; `multibin` must
# expose inverse_transform() (presumably a fitted MultiLabelBinarizer - verify).
classifier = joblib.load('clf.txt')
multibin = joblib.load('multibin.txt')
# Default-configured CountVectorizer; not referenced elsewhere in the visible code.
vectorizer_2=CountVectorizer()
def _clean_question(text):
    """Normalise raw question text into the string handed to the classifier.

    The steps run in a fixed order: punctuation and URL stripping, separator
    normalisation, removal of stray dots/dashes/numbers, stemming with the
    module-level ``stemmer``, and finally dropping tokens found in the
    module-level stop-word set ``s``.

    Args:
        text: The raw question text (title and/or body).

    Returns:
        The surviving stemmed tokens, each followed by a single space (so a
        non-empty result ends with a trailing space). An input with no
        surviving tokens yields ``''``.
    """
    # Newlines and most punctuation become spaces.
    text = re.sub('\n', ' ', text)
    text = re.sub('[!@%^&*()$:"?<>=~,;`{}|]', ' ', text)
    # Strip URL-like spans.
    # NOTE(review): ':' was already replaced above, so the 'https?://'
    # alternative can no longer match here; only the 'www.' and 'domain.tld/'
    # forms do. The order is kept as-is because the saved model presumably
    # expects text cleaned exactly this way - confirm before reordering.
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))''', ' ', text)
    # Underscores become hyphens; brackets and slashes become spaces.
    text = re.sub('_', '-', text)
    text = text.replace('[', ' ')
    text = text.replace(']', ' ')
    text = text.replace('/', ' ')
    text = text.replace('\\', ' ')
    # Drop free-standing runs of dashes (whitespace on both sides).
    text = re.sub(r'(\s)\-+(\s)', r'\1', text)
    # Drop dots that end a token, that lead into a word after '..', or that
    # stand alone between whitespace.
    text = re.sub(r'\.+(\s)', r'\1', text)
    text = re.sub(r'\.+\.(\w)', r'\1', text)
    text = re.sub(r'(\s)\.+(\s)', r'\1', text)
    # Remove apostrophes outright so contractions collapse into one token.
    text = re.sub("'", '', text)
    # Replace whitespace-led numeric tokens (e.g. ' 1.5', ' -3', ' 42 ',
    # ' 10+') with a single space.
    text = re.sub(r'\s\d+[\.\-\+]+\d+|\s[\.\-\+]+\d+|\s+\d+\s+|\s\d+[\+\-]+', ' ', text)
    # Same pattern as before, now a raw string: '\d' and '\s' in a plain
    # literal are invalid escape sequences and trigger a warning.
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # Drop free-standing runs of '#' or '+'.
    text = re.sub(r'\s\#+\s|\s\++\s', ' ', text)

    stemmed_tokens = [stemmer.stem(token) for token in text.split()]
    kept_tokens = [token for token in stemmed_tokens if token not in s]
    # One join instead of repeated '+='; the trailing space after every token
    # is kept so the produced string is identical to the previous output.
    return ''.join(token + ' ' for token in kept_tokens)


def predictTags(question=None):
    """Predict tags for one question, print them, and return them.

    Args:
        question: Text to classify. When ``None`` (the default, matching the
            original zero-argument call) the text is read interactively with
            ``input("Enter question:")``.

    Returns:
        A list of the predicted tags for the question, i.e. the first entry
        of ``multibin.inverse_transform(classifier.predict([cleaned]))``.
        Previously the function only printed this list and returned ``None``.

    Side effects:
        Prints the cleaned text as ``T [...]`` and then the tag list, exactly
        as before.
    """
    if question is None:
        question = input("Enter question:")

    # The classifier is called with a one-element batch.
    T = [_clean_question(question)]
    print("T", T)

    predicted = classifier.predict(T)
    label_sets = multibin.inverse_transform(predicted)

    # Only one sample was submitted, so only the first label set is relevant.
    tagarr = list(label_sets[0])
    print(tagarr)
    return tagarr
# Script entry: run a single interactive prediction. There is no
# `if __name__ == "__main__":` guard, so this call also executes (and prompts
# for input) whenever this module is imported.
predictTags()