-
Notifications
You must be signed in to change notification settings - Fork 0
/
model_utilities.py
74 lines (62 loc) · 2.88 KB
/
model_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import numpy as np # linear algebra
import pandas as pd
import warnings
warnings.filterwarnings("ignore") #Ignoring unnecessory warnings
import numpy as np #for large and multi-dimensional arrays
import pandas as pd #for data manipulation and analysis
import nltk #Natural language processing tool-kit
from nltk.corpus import stopwords #Stopwords corpus
from nltk.stem import PorterStemmer # Stemmer
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer #For TF-IDF
from gensim.models import Word2Vec #For Word2Vec
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
def essay_to_wordlist(essay_v, remove_stopwords):
    """Clean an essay down to a list of lowercase word tokens.

    Non-alphabetic characters are replaced with spaces before splitting,
    so punctuation and digits never appear in the output. When
    ``remove_stopwords`` is truthy, English stopwords (NLTK corpus) are
    filtered out of the result.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if remove_stopwords:
        stop_set = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_set]
    return tokens
def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences, each tokenized into a word list.

    Uses NLTK's pre-trained English Punkt sentence tokenizer, then
    delegates per-sentence word tokenization (and optional stopword
    removal) to essay_to_wordlist(). Empty sentences are skipped.
    """
    sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    result = []
    for raw in sentence_splitter.tokenize(essay_v.strip()):
        if raw:
            result.append(essay_to_wordlist(raw, remove_stopwords))
    return result
def makeFeatureVec(words, model, num_features):
    """Average the Word2Vec vectors of all in-vocabulary words of an essay.

    Parameters
    ----------
    words : list of str
        Tokenized essay (e.g. output of essay_to_wordlist).
    model : gensim Word2Vec model
        Trained model; the vocabulary is read from ``model.wv``.
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    numpy.ndarray of shape (num_features,), dtype float32
        Mean vector of the known words. If NO word is in the model
        vocabulary, the zero vector is returned instead of dividing by
        zero (the previous code produced an all-NaN vector here, which
        silently poisoned the downstream feature matrix).
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0
    # NOTE(review): ``index2word`` and ``model[word]`` are the gensim 3.x
    # API; gensim 4.x renamed them to ``index_to_key`` / ``model.wv[word]``
    # — confirm against the installed gensim version.
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec, model[word])
    if num_words == 0:
        # No in-vocabulary words: return zeros rather than NaNs.
        return featureVec
    return np.divide(featureVec, num_words)
def getAvgFeatureVecs(essays, model, num_features):
    """Stack the averaged word vector of every essay into one matrix.

    Returns a float32 array of shape (len(essays), num_features) where
    row i is makeFeatureVec(essays[i], model, num_features).
    """
    feature_matrix = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay in enumerate(essays):
        feature_matrix[row] = makeFeatureVec(essay, model, num_features)
    return feature_matrix