-
Notifications
You must be signed in to change notification settings - Fork 3
/
train_language_model.py
181 lines (130 loc) · 5.16 KB
/
train_language_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Program to create a character-level neural language model
# required imports
import os
import re
import warnings
import numpy as np
import pandas as pd
import pickle
from nltk import ngrams
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
warnings.simplefilter(action='ignore', category=FutureWarning)
def read_text(path, max_chars=100000):
    """ Function to read input data

    Every regular file directly inside ``path`` is read as UTF-8 and the
    contents are concatenated. Files are visited in sorted filename order
    because ``os.listdir`` returns entries in arbitrary order, which would
    otherwise make the truncated result differ between runs and machines.

    Args:
        path (string): the parent path of the folder containing the input text files
        max_chars (int, optional): maximum number of characters to return.
            Defaults to 100000 because the full dataset is too large for
            training on a CPU. Pass None to return the complete text.

    Returns:
        string: The text read from input files appended in a single string.
            The string starts with a single space and is cut to at most
            ``max_chars`` characters.

    Raises:
        ValueError: if ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be non-negative or None")

    # The result has always started with one space; keep that for callers.
    parts = [' ']
    total_chars = 1
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        # Sub-directories cannot be opened as text files, so skip them.
        if not os.path.isfile(file_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(file_path, mode='r', encoding='utf-8') as text_file_buffer:
            chunk = text_file_buffer.read()
        parts.append(chunk)
        total_chars += len(chunk)
        # Anything read past the limit would be sliced away, so stop early.
        if max_chars is not None and total_chars >= max_chars:
            break

    text = ''.join(parts)
    if max_chars is None:
        return text
    return text[:max_chars]
def preprocess_text(text):
    """ Function for basic cleaning and pre-processing of input text

    The text is lower-cased, possessive ``'s`` endings are removed, every
    non-letter character becomes a space, and words shorter than three
    characters are dropped. The remaining words are joined by single spaces.

    Args:
        text (string): raw input text

    Returns:
        string: cleaned text
    """
    lowered = text.lower()
    without_possessive = re.sub(r"'s\b", "", lowered)
    letters_only = re.sub("[^a-zA-Z]", " ", without_possessive)

    kept_words = []
    for word in letters_only.split():
        if len(word) < 3:
            continue
        kept_words.append(word)
    # Joining non-empty, whitespace-free words leaves nothing to strip.
    return ' '.join(kept_words)
def prepare_text(text, seq_length=31):
    """ Function to prepare text in sequence of ngrams

    A window of ``seq_length`` characters is slid over ``text`` one character
    at a time. Slicing the string directly gives the same windows as building
    character tuples and joining them back, without the intermediate tuples.

    Args:
        text (string): complete input text
        seq_length (int, optional): number of characters in each sequence.
            Defaults to 31.

    Returns:
        list : a list of text sequence with ``seq_length`` characters each
            (31 by default); empty when ``text`` is shorter than
            ``seq_length``.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")

    # A negative window count makes range() empty, so short texts yield [].
    window_count = len(text) - seq_length + 1
    return [text[start:start + seq_length] for start in range(window_count)]
def encoding_sequence(text, sequence):
    """ Function to encode the character sequence into number sequence

    Each distinct character of ``text`` gets an integer index according to
    its position in the sorted set of characters. Every string in
    ``sequence`` is then translated character by character.

    Args:
        text (string): cleaned text
        sequence (list): character sequence list

    Returns:
        dict: dictionary mapping of all unique input characters to integers
        list: number encoded character sequences
    """
    vocabulary = sorted(set(text))
    mapping = {char: index for index, char in enumerate(vocabulary)}

    encoded_sequence = []
    for sub_sequence in sequence:
        encoded_sequence.append([mapping[char] for char in sub_sequence])
    return mapping, encoded_sequence
def split_data(mapping, encoded_sequence):
    """ Function to split the prepared data in train and test

    In every encoded sequence the last value is the target and all values
    before it are the input features. Targets are one-hot encoded with one
    class per entry in ``mapping``. 10% of the rows are held out, using a
    fixed random seed.

    Args:
        mapping (dict): dictionary mapping of all unique input characters to integers
        encoded_sequence (list): number encoded character sequences

    Returns:
        numpy array : train and test split numpy arrays, in the order
            X_train, X_test, y_train, y_test
    """
    data = np.array(encoded_sequence)
    features = data[:, :-1]
    targets = to_categorical(data[:, -1], num_classes=len(mapping))
    return tuple(train_test_split(
        features, targets, test_size=0.1, random_state=42))
def train_model(mapping, X_train, X_test, y_train, y_test, epochs=20):
    """Function to train the model

    The network is an Embedding layer (50 dimensions), followed by a GRU
    (50 units, dropout and recurrent dropout of 0.1), followed by a softmax
    Dense layer with one output per entry in ``mapping``. It is compiled
    with the Adam optimizer and categorical cross-entropy loss.

    Args:
        mapping (dict): dictionary mapping of all unique input characters to integers
        X_train (ndarray): Training data feature split
        X_test (ndarray): Testing data feature split, used as validation
            data while fitting
        y_train (ndarray): training data target split
        y_test (ndarray): testing data target split, used as validation
            data while fitting
        epochs (int, optional): number of training epochs. Defaults to 20.

    Returns:
        keras model : Trained keras model
    """
    vocab = len(mapping)
    # Take the input length from the data instead of hard-coding 30, so the
    # model always matches the sequences it is given.
    sequence_length = int(np.shape(X_train)[1])

    model = Sequential()
    model.add(Embedding(vocab, 50, input_length=sequence_length,
                        trainable=True))
    model.add(GRU(50, recurrent_dropout=0.1, dropout=0.1))
    model.add(Dense(vocab, activation='softmax'))
    model.summary()

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy', metrics=['accuracy'])
    print("\n")
    print(" !!Fitting the model !! ")
    model.fit(X_train, y_train, epochs=epochs, verbose=2,
              validation_data=(X_test, y_test))
    return model
def save_model(mapping, model, path, mapping_path='models/mapping.pickle'):
    """ Function to save mapping and trained model

    Missing parent directories of both output files are created first. The
    mapping is pickled inside a context manager so the file is flushed and
    closed even if pickling fails.

    Args:
        mapping (dict): dictionary mapping of all unique input characters to integers
        model (keras model): trained keras model
        path (string): path where model will be stored
        mapping_path (string, optional): path where the pickled mapping will
            be stored. Defaults to 'models/mapping.pickle'.
    """
    for target in (path, mapping_path):
        parent_dir = os.path.dirname(target)
        # An empty dirname means the current directory; nothing to create.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    model.save(path)
    with open(mapping_path, 'wb') as mapping_file:
        pickle.dump(mapping, mapping_file)
if __name__ == "__main__":
    # End-to-end pipeline: read the raw text, clean it, cut it into
    # character sequences, encode them, split them, train the model, and
    # store the trained model together with its character mapping.
    data_dir = 'imdb_data/'
    model_path = 'models/char_based_neural_lang_model.h5'

    print(" !!.... Reading texts ....!! ")
    raw_text = read_text(data_dir)

    print(" !!.... Preprocessing texts ....!! ")
    clean_text = preprocess_text(raw_text)

    print(" !!.... Preparing texts ....!! ")
    sequence = prepare_text(clean_text)

    print(" !!.... Encode sequence ....!! ")
    mapping, encoded_sequence = encoding_sequence(clean_text, sequence)

    print(" !!.... Spliting data ....!! ")
    # split_data returns X_train, X_test, y_train, y_test in this order,
    # which is the order train_model expects after the mapping.
    splits = split_data(mapping, encoded_sequence)

    print(" !!.... Training model ....!! ")
    model = train_model(mapping, *splits)

    print(" !!.... Saving model and Mapping ....!! ")
    save_model(mapping, model, model_path)