# preprocess.py
from string import punctuation
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()


def sentiment_analyzer_scores(sentence):
    """Return VADER's compound polarity score for a sentence, a float in [-1, 1]."""
    return analyser.polarity_scores(sentence)['compound']
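
# A quick sanity check of the helper above (hypothetical sentences; the compound
# score runs from -1, most negative, to +1, most positive):
#   sentiment_analyzer_scores("I love this!")   # > 0 (positive)
#   sentiment_analyzer_scores("I hate this!")   # < 0 (negative)
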
def clean_tweets(filepath, emoticons):
    """Read a file of tweets (one per line) and return a list of cleaned tweets.

    `emoticons` is a dictionary mapping emoticon strings to words.
    """
    with open(filepath, 'r', encoding="utf8") as f:
        tweets_list = f.read().splitlines()  # one tweet per line
    stop_words = set(stopwords.words('english'))  # build the stopword set once, not per word
    all_tweets = []
    for tweet in tweets_list:
        cur_tweet = []
        words = tweet.split()  # split the tweet into a list of words
        # replace every emoticon in the tweet (if any) with its word mapping
        reformed = [emoticons[word] if word in emoticons else word for word in words]
        new_tweet = " ".join(reformed)  # reconstruct the tweet
        words = new_tweet.split()  # split the new tweet into a list of words
        for word in words[3:]:  # the actual tweet text starts at token index 3
            # ignore @somename, #somehashtag and http(s)://somelink
            if word[0] == '@' or word[0] == '#' or word[:4] == 'http':
                continue
            clean_word = word.strip(punctuation).lower()  # strip punctuation, lowercase
            if clean_word == '':  # ignore words that were nothing but punctuation
                continue
            if clean_word in stop_words:  # ignore stopwords
                continue
            cur_tweet.append(clean_word)  # keep each cleaned word of the current tweet
        all_tweets.append(" ".join(cur_tweet))  # reconstruct and store the cleaned tweet
    return all_tweets
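
# Sketch of expected usage, assuming each input line has the form
# "<id> <topic> <label> <tweet text ...>" (so tokens[3:] is the tweet body);
# the mapping and path below are hypothetical:
#   emoticons = {':)': 'smile', ':(': 'frown'}
#   tweets = clean_tweets('data/train.txt', emoticons)
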
def get_labels(filepath):
    """Return the label of every tweet in the file, one per line."""
    with open(filepath, 'r', encoding="utf8") as f:
        tweets_list = f.read().splitlines()  # one tweet per line
    labels = []
    for tweet in tweets_list:
        words = tweet.split()
        if len(words) == 2:  # *gold.txt files: "<id> <label>"
            labels.append(words[1])
        else:  # otherwise the label is the third token: "<id> <topic> <label> ..."
            labels.append(words[2])
    return labels
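
# Hypothetical call, matching the two line formats noted above:
#   labels = get_labels('data/train.txt')   # e.g. ['positive', 'negative', ...]
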
def tweet2vector(w2vmodel, tokens_list, num_of_features, lexicon_1, lexicon_2):
    """Average the word vectors of a tokenized tweet and append lexicon features."""
    vector = np.zeros(num_of_features).reshape((1, num_of_features))  # start from all zeros
    count = 0
    lex_1 = 0    # sum of lexicon_1 valences seen in the tweet
    lex_1c = 0   # how many tweet tokens appear in lexicon_1
    lex_2 = 0
    lex_2c = 0
    max_valence = -1000
    min_valence = 1000
    if len(tokens_list) == 0:  # tweet is empty after the cleaning - happens for some tweets
        # assign the vector of a neutral/very common word to the tweet;
        # 'tomorrow' seems to give good results
        vector = w2vmodel['tomorrow'].reshape((1, num_of_features))
    for word in tokens_list:
        if lexicon_1 is not None and word in lexicon_1:
            lex_1 += lexicon_1[word]
            lex_1c += 1
        if lexicon_2 is not None and word in lexicon_2:
            lex_2 += lexicon_2[word]
            lex_2c += 1
        try:
            vector += w2vmodel[word].reshape((1, num_of_features))
            average_valence = np.mean(w2vmodel[word])  # mean over the word's embedding dims
            if max_valence < average_valence:
                max_valence = average_valence
            if min_valence > average_valence:
                min_valence = average_valence
            count += 1
        except KeyError:  # token not in the vocabulary (its frequency was < min_count)
            continue
    if count != 0:
        vector /= count  # average of the in-vocabulary word vectors
    # append the average lexicon_1 valence (0 if no token was in the lexicon)
    if lex_1c != 0:
        lex_1 /= lex_1c
        vector = np.array([np.append(vector, lex_1)])
    else:
        vector = np.array([np.append(vector, 0)])
    # append the average lexicon_2 valence (0 if no token was in the lexicon)
    if lex_2c != 0:
        lex_2 /= lex_2c
        vector = np.array([np.append(vector, lex_2)])
    else:
        vector = np.array([np.append(vector, 0)])
    # append the midpoint of the max and min per-word mean valences
    average_valence = (max_valence + min_valence) / 2
    vector = np.array([np.append(vector, average_valence)])
    return vector
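
# Hypothetical usage, assuming `w2vmodel` supports dict-style word lookup and
# raises KeyError for out-of-vocabulary words (e.g. gensim KeyedVectors), and
# that each lexicon maps word -> valence score:
#   from gensim.models import Word2Vec
#   wv = Word2Vec([t.split() for t in tweets], vector_size=100, min_count=2).wv
#   vec = tweet2vector(wv, tweets[0].split(), 100, None, None)
#   # vec.shape == (1, 103): 100 averaged embedding dims
#   #                        + 2 lexicon features + 1 valence feature
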
def vader_sentiment_analysis(w2vmodel, original_tweets):
    """Append each tweet's VADER compound score to its feature vector.

    `w2vmodel` here is the array of tweet vectors (one row per tweet), and
    `original_tweets[i]` holds the i-th raw tweet text at index 0.
    """
    model = []
    for i, vector in enumerate(w2vmodel):
        if len(original_tweets[i]) > 0:
            model.append(np.append(vector, sentiment_analyzer_scores(original_tweets[i][0])))
        else:  # no raw text available - append a neutral score
            model.append(np.append(vector, 0))
    return np.array(model)
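

if __name__ == '__main__':
    # End-to-end sketch under the assumptions noted above; the path, the
    # emoticon mapping and the word2vec settings are hypothetical placeholders,
    # and 'tomorrow' must be in-vocabulary for empty tweets to be handled.
    from gensim.models import Word2Vec

    emoticons = {':)': 'smile', ':(': 'frown'}
    tweets = clean_tweets('data/train.txt', emoticons)
    labels = get_labels('data/train.txt')
    wv = Word2Vec([t.split() for t in tweets], vector_size=100, min_count=2).wv
    vectors = np.vstack([tweet2vector(wv, t.split(), 100, None, None) for t in tweets])
    vectors = vader_sentiment_analysis(vectors, [[t] for t in tweets])
    print(vectors.shape, len(labels))  # (num_tweets, 104): 100 dims + 3 lexicon/valence + 1 VADER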