-
Notifications
You must be signed in to change notification settings - Fork 0
/
processed.py
70 lines (47 loc) · 1.37 KB
/
processed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from difflib import diff_bytes
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import csv
import re
def rem_html_tags(question):
    """Return *question* with every ``<...>`` span removed.

    The match is non-greedy, so each tag is deleted on its own and the text
    between tags is kept. ``.`` does not match a newline here, so a ``<``
    and ``>`` that sit on different lines are left untouched.

    Args:
        question: Text that may contain HTML-style tags.

    Returns:
        The text with the matched tag spans deleted.
    """
    return re.sub('<.*?>', '', question)
def removePunct(question):
    """Replace each run of non-word characters in *question* with one space.

    Non-word characters are everything except letters, digits and the
    underscore, so punctuation and whitespace runs collapse to a single
    space while underscores are kept. Leading and trailing whitespace is
    stripped from the result.

    Args:
        question: Text to clean.

    Returns:
        The cleaned text.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return re.sub(r'\W+', ' ', question).strip()
_STOP_WORDS = None  # lazily-filled cache of NLTK's English stop words


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Left as None if the corpus lookup raises, so a later call retries.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def tokenize(to_token):
    """Split *to_token* into word tokens and drop English stop words.

    Each token is lower-cased before the stop-word lookup, so capitalised
    stop words such as "The" are removed as well. Previously that
    case-insensitive result was computed and then overwritten by a second,
    case-sensitive pass. Tokens that are kept retain their original casing.

    Args:
        to_token: The text to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    stop_words = _english_stop_words()
    return [w for w in word_tokenize(to_token) if w.lower() not in stop_words]
def listToString(s):
    """Join the strings in *s* into one string separated by single spaces.

    Args:
        s: An iterable of strings.

    Returns:
        The joined string; an empty iterable gives an empty string.
    """
    return " ".join(s)
# Input data set, loaded once at import time. A `global` statement is a
# no-op at module scope, so the name is simply assigned here.
df = pd.read_csv('stackoverflow_new.csv')
def preprocess(df):
    """Tokenize every entry of ``df['Text']`` and write the results to ``empty.csv``.

    Each entry is passed through ``tokenize`` and the surviving tokens are
    re-joined with ``listToString``. The output file gets one single-column
    row per input entry, in column order, with no header row. A completion
    message is printed at the end.

    Args:
        df: A DataFrame with a ``'Text'`` column of strings.

    NOTE(review): ``rem_html_tags`` and ``removePunct`` are defined in this
    file but are not applied here -- confirm whether that is intentional.
    """
    # Build every row before touching the output file, so a failure while
    # tokenizing neither leaks an open handle nor truncates an existing file.
    # Iterating the column directly also works when the index is not 0..n-1.
    rows = [[listToString(tokenize(question))] for question in df['Text']]

    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open("empty.csv", 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(rows)
    print("preprocess completed")
# Run the preprocessing step on the module-level DataFrame `df`.
preprocess(df)