# encoding=utf-8
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.downloader as api
from gensim.models import word2vec
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Input, Flatten, Concatenate
from tensorflow.keras.models import Model
##################################################
# preprocess
##################################################
## Stop-word removal is left to the TfidfVectorizer, so stop words are kept here.
def text_process(filename):
    regex_web = re.compile(r'http://.*\s+')  # remove everything from "http://" up to the next whitespace; fine here because URLs stand alone
    pattern = r'[a-zA-Z][-._a-zA-Z]*[a-zA-Z]'  # allow repeated hyphens, dots, underscores inside a word, e.g. "Te--rm..ina"
    with open(filename, 'r', encoding='utf-8') as f:
        t1 = f.read().lower()  # lowercase everything
    t2 = re.sub('\n{2,}', '\n\n', t1)  # collapse runs of 2+ newlines; a blank line marks a paragraph boundary
    print("Line indicator processing: Deleted # characters: ", len(t1) - len(t2))
    t3 = re.sub(regex_web, "", t2)  # strip URLs
    print("Urls processing: Deleted # characters: ", len(t2) - len(t3))
    t4 = t3.split('\n\n')  # split into paragraphs
    print("# Paragraph: ", len(t4))
    cleaned = [" ".join(re.findall(pattern, para)) for para in t4]  # keep only word tokens in each paragraph; stop-word removal is left to the vectorizer
    # TODO: collapse extra whitespace
    # TODO: replace digits with 'x'
    return cleaned
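# The two TODO steps above are not implemented inside text_process. A minimal sketch of what
# they might look like, applied to the paragraph strings it returns; the helper name and the
# exact regexes are assumptions, not part of the original pipeline.
def normalize_paragraphs(paragraphs):
    """Hypothetical helper: collapse whitespace runs and mask digits as 'x'."""
    normalized = []
    for para in paragraphs:
        para = re.sub(r'\s+', ' ', para).strip()  # collapse extra whitespace
        para = re.sub(r'\d', 'x', para)           # replace every digit with 'x'
        normalized.append(para)
    return normalized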
## Drop noise samples when building paragraphs: a very short sentence is not reasonable to
## treat as a paragraph, so the default character-count threshold is 40.
def para_filter(dt1, threshold_len=40):
    tt = pd.DataFrame(dt1)
    leng = tt.iloc[:, 0].str.len()
    # plt.plot(leng)
    deleted_dt = tt[leng < threshold_len]
    print("Under threshold, # possible paragraphs deleted: ", len(deleted_dt))
    samples = tt[leng >= threshold_len].reset_index(drop=True)
    return samples, deleted_dt
# dt1 = text_process(wd + filename[0])
# dtt1, deleted_dt1 = para_filter(dt1, threshold_len=100)
# deleted_dt1
# # Dataset construction
# dttt = pd.DataFrame(np.empty((0, 1), float))  # final dataset
# for i in range(len(filename)):
#     print("\n\n========================================================")
#     dt1 = text_process(wd + filename[i])
#     dtt1, deleted_dt1 = para_filter(dt1, threshold_len=5)  # keep the threshold low to avoid sparsity
#     print('\nLabel:', str(i), '\nFor this document', filename[i], "it preserved # of samples: ", dtt1.shape[0], '\n')
#     dtt1['label'] = i
#     dttt = pd.concat([dttt, dtt1], axis=0, ignore_index=True)
# dttt
##################################################
# EDA
##################################################
def eda_MAX_NB_WORDS(corpus, ratio=0.95, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', char_level=False):
    '''
    Input: list of sentences (string type)
    Returns the vocabulary size that covers `ratio` of all token occurrences.
    '''
    # ratio = 0.95
    # corpus = train_5k
    tokenizer_eda = Tokenizer(num_words=None, filters=filters, lower=True, char_level=char_level)  # with these filters, separate NLTK preprocessing is unnecessary
    tokenizer_eda.fit_on_texts(corpus)
    b = pd.DataFrame(tokenizer_eda.word_counts.items(), columns=['word', 'count'])
    a = b.sort_values(by='count', ascending=False).reset_index()  # sort by frequency; this order matches the tokenizer's word_index
    ############# cumulative percentage plot
    plt.figure(figsize=(20, 5))
    word_distribution = a['count'].cumsum() / a['count'].sum()  # cumulative share of token occurrences
    word_distribution.plot()
    cut_index = np.argmin(abs(word_distribution - ratio))  # index whose cumulative share is closest to `ratio`
    plt.plot([cut_index, cut_index], [0, ratio])  # vertical marker at the chosen ratio
    plt.plot([0, cut_index], [ratio, ratio])
    plt.xlabel("word_index")  # meaningful as a word index only because counts were sorted first
    plt.ylabel("word_cum_counts_perc")
    plt.title("MAX_NB_WORDS Cumsum Percentage")
    plt.show()
    ############# rough view of the raw word-count range
    plt.figure(figsize=(20, 5))
    a['count'].plot()  # sorted counts, so the cut_index marker lines up with the curve
    plt.plot([cut_index, cut_index], [0, max(a['count'])])  # vertical marker at the chosen ratio
    plt.plot([0, cut_index], [max(a['count']), max(a['count'])])
    plt.xlabel("word_index")
    plt.ylabel("word_count")
    plt.title("MAX_NB_WORDS Word Counts")
    plt.show()
    print("Cut index with", ratio * 100, "% of corpus: ", cut_index, '\n')
    # the most frequent words are probably stopwords
    print(a.sort_values(by='count', ascending=False).head(20))
    return int(cut_index)
    # return int(cut_index) + 1
# eda_MAX_NB_WORDS(corpus = filtered_corpus, ratio = 0.95)
def eda_MAX_DOC_LEN(corpus, ratio=0.9, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', char_level=False):
    '''
    Input: list of sentences (string type)
    Returns the document length (in tokens) that covers `ratio` of the corpus.
    '''
    # MAX_DOC_LEN = 30
    # corpus = train_5k
    tokenizer_eda = Tokenizer(num_words=None, filters=filters, lower=True, char_level=char_level)
    tokenizer_eda.fit_on_texts(corpus)
    dt_q1 = pd.DataFrame([len(i) for i in tokenizer_eda.texts_to_sequences(corpus)], columns=['length'])
    c = dt_q1['length'].value_counts().sort_index()  # frequency of each document length, sorted by length
    sent_cdf = c.cumsum() / c.sum()
    sent_pdf = c / c.sum()
    cut_index = int(sent_cdf.index[np.argmin(abs(sent_cdf.values - ratio))])  # length whose cumulative share is closest to `ratio`
    ############# cumulative percentage plot
    plt.figure(figsize=(20, 5))
    sent_cdf.plot()
    plt.plot([cut_index, cut_index], [0, ratio])  # vertical marker at the chosen ratio
    plt.plot([0, cut_index], [ratio, ratio])
    plt.xlabel("doc_length")
    plt.ylabel("doc_cum_counts_perc")
    plt.title("MAX_DOC_LEN CDF")
    plt.show()
    plt.figure(figsize=(20, 5))
    sent_pdf.plot()
    plt.plot([cut_index, cut_index], [0, max(sent_pdf)])  # vertical marker at the chosen ratio
    plt.plot([0, cut_index], [max(sent_pdf), max(sent_pdf)])  # horizontal marker
    plt.xlabel("doc_length")
    plt.ylabel("doc_counts_perc")
    plt.title("MAX_DOC_LEN PDF")
    plt.show()
    print("Cut index with", ratio * 100, "% of corpus: ", cut_index)
    return int(cut_index)
# eda_MAX_DOC_LEN(corpus = filtered_corpus, ratio=0.9)
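# The two EDA helpers above are typically run together to pick the constants consumed by the
# text_preprocessor class and cnn_model below. A small convenience wrapper along those lines
# might look like this; the wrapper itself is an illustration, not part of the original workflow.
def choose_constants(corpus, vocab_ratio=0.95, len_ratio=0.9):
    """Hypothetical helper: return (MAX_NB_WORDS, MAX_DOC_LEN) chosen from the EDA plots."""
    max_nb_words = eda_MAX_NB_WORDS(corpus=corpus, ratio=vocab_ratio)  # vocabulary size covering vocab_ratio of token occurrences
    max_doc_len = eda_MAX_DOC_LEN(corpus=corpus, ratio=len_ratio)      # document length covering len_ratio of documents
    return max_nb_words, max_doc_len
# MAX_NB_WORDS, MAX_DOC_LEN = choose_constants(filtered_corpus)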
##################################################
# Dataset preparation class
##################################################
class text_preprocessor(object):  # stop-word handling only exists in NLTK's tokenizer; with Keras, drop stop words during preprocessing instead
    def __init__(self, doc_len, max_words, docs, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', char_level=False, zero_pad='_'):
        '''
        Initialize a processor.
        input: a sequence of strings (the training dataset)
        processor = text_preprocessor(MAX_DOC_LEN, MAX_NB_WORDS, sentences_train)
        '''
        self.MAX_DOC_LEN = doc_len
        self.MAX_NB_WORDS = max_words
        self.tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS, filters=filters, char_level=char_level)
        self.tokenizer.fit_on_texts(docs)
        self.corpus = docs
        if zero_pad:  # map index 0 to an explicit padding symbol; not needed for the CNN
            self.tokenizer.word_index[zero_pad] = 0
            self.tokenizer.index_word[0] = zero_pad
        self.index_word = self.tokenizer.index_word
        self.word_index = self.tokenizer.word_index

    def __repr__(self):  # shown when the object is printed
        return 'A class with the methods:\n' \
               'generate_seq(sentences_train)\n' \
               'w2v_pretrain(embedding dimension)\n' \
               'load_glove_w2v(embedding dimension)\n'
    def generate_seq(self, docs, padding='post', truncating='post'):
        sequences = self.tokenizer.texts_to_sequences(docs)
        # padded_sequences = pad_sequences(sequences, maxlen=self.MAX_DOC_LEN, padding='post')
        padded_sequences = pad_sequences(sequences, maxlen=self.MAX_DOC_LEN, padding=padding, truncating=truncating)
        # no "+1" needed here
        return padded_sequences
    def w2v_pretrain(self, EMBEDDING_DIM):
        ## Train a Word2Vec embedding on the tokens of the training sentences.
        # gensim expects a list of token lists, so split the raw strings into words first
        tokenized_corpus = [text_to_word_sequence(doc) for doc in self.corpus]
        wv_model = word2vec.Word2Vec(sentences=tokenized_corpus, min_count=1, seed=1, cbow_mean=1,
                                     size=EMBEDDING_DIM, negative=5, window=5, iter=30,
                                     workers=8)  # CBOW by default; pass sg=1 for skip-gram
        NUM_WORDS = min(self.MAX_NB_WORDS,
                        len(self.tokenizer.word_index))  # keep only the highest-frequency words
        embedding_matrix = np.zeros((NUM_WORDS + 1, EMBEDDING_DIM))  # "+1" row for the padding symbol at index 0
        # embedding_matrix = np.zeros((NUM_WORDS, EMBEDDING_DIM))  # the RNN version does not need the extra row
        for word, i in self.tokenizer.word_index.items():
            if i >= NUM_WORDS:
                continue
            if word in wv_model.wv:
                embedding_matrix[i] = wv_model.wv[word]  # copy the pretrained vector into the row for this word index
        PRETRAINED_WORD_VECTOR = embedding_matrix
        return PRETRAINED_WORD_VECTOR
    def load_glove_w2v(self, EMBEDDING_DIM):
        word_vectors = api.load(
            "glove-wiki-gigaword-" + str(EMBEDDING_DIM))  # load pre-trained GloVe vectors from gensim-data
        NUM_WORDS = min(self.MAX_NB_WORDS, len(self.tokenizer.word_index))  # keep only the highest-frequency words
        embedding_matrix = np.zeros((NUM_WORDS + 1, EMBEDDING_DIM))  # "+1" row for the padding symbol at index 0
        # embedding_matrix = np.zeros((NUM_WORDS, EMBEDDING_DIM))  # the RNN version does not need the extra row
        for word, i in self.tokenizer.word_index.items():
            if i >= NUM_WORDS:  # drop words beyond the cutoff
                continue
            if word in word_vectors:  # api.load returns KeyedVectors, so test membership on it directly
                embedding_matrix[i] = word_vectors[word]  # copy the pretrained vector into the row for this word index
        PRETRAINED_WORD_VECTOR = embedding_matrix
        return PRETRAINED_WORD_VECTOR
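# A minimal usage sketch of text_preprocessor, kept commented out like the other examples in
# this file; `sentences_train` and the constants are assumed to come from the preprocessing
# and EDA steps above.
# processor = text_preprocessor(MAX_DOC_LEN, MAX_NB_WORDS, sentences_train)
# X_train = processor.generate_seq(sentences_train)            # padded integer sequences, shape (n_docs, MAX_DOC_LEN)
# glove_matrix = processor.load_glove_w2v(EMBEDDING_DIM=200)   # (NUM_WORDS + 1, 200) GloVe weight matrix
# w2v_matrix = processor.w2v_pretrain(EMBEDDING_DIM=200)       # Word2Vec matrix trained on the training corpus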
##################################################
# CNN model definition
##################################################
def cnn_model(FILTER_SIZES, MAX_NB_WORDS, MAX_DOC_LEN, NAME='cnn_base', EMBEDDING_DIM=200, NUM_FILTERS=64,
              PRETRAINED_WORD_VECTOR=None, trainable_switch=True, bert_embedding=True):
    # bert_embedding is currently unused; kept for interface compatibility
    main_input = Input(shape=(MAX_DOC_LEN,), dtype='int32', name='main_input')
    if PRETRAINED_WORD_VECTOR is not None:
        embed_1 = Embedding(input_dim=MAX_NB_WORDS, output_dim=EMBEDDING_DIM, embeddings_initializer='uniform',
                            input_length=MAX_DOC_LEN, name='pretrained_embedding_trainable',
                            weights=[PRETRAINED_WORD_VECTOR], trainable=trainable_switch)(main_input)
    else:  # trainable by default
        embed_1 = Embedding(input_dim=MAX_NB_WORDS, output_dim=EMBEDDING_DIM, embeddings_initializer='uniform',
                            input_length=MAX_DOC_LEN, name='embedding_trainable',
                            trainable=True)(main_input)
    # the "+1" for the padding row is left to the caller:
    # embed_1 = Embedding(input_dim=MAX_NB_WORDS + 1, output_dim=EMBEDDING_DIM, embeddings_initializer='uniform',
    #                     input_length=MAX_DOC_LEN, name='embedding_trainable', trainable=True)(main_input)
    # Convolution-pooling-flatten block for each filter size
    conv_blocks = []
    for f in FILTER_SIZES:
        conv = Conv1D(filters=NUM_FILTERS, kernel_size=f, name='conv_' + str(f) + '_gram', strides=1,
                      activation='relu')(embed_1)  # each kernel extracts NUM_FILTERS feature maps with ReLU
        pool = MaxPooling1D(pool_size=MAX_DOC_LEN - f + 1, name='pool_' + str(f) + '_gram')(conv)  # pool size = MAX_DOC_LEN - filter_size + 1
        flat = Flatten(name='flat_' + str(f) + '_gram')(pool)  # flatten to a vector of length NUM_FILTERS
        conv_blocks.append(flat)
    if len(conv_blocks) > 1:
        z = Concatenate(name='concate')(conv_blocks)  # Concatenate takes a list, e.g. [flat_1, flat_2, flat_3]
    else:
        z = conv_blocks[0]
    # pred = Dense(3, activation='softmax')(z)
    model = Model(inputs=main_input, outputs=z, name=NAME)
    return model
# testing
# cnn_base = cnn_model(FILTER_SIZES=[2,3,4], NUM_FILTERS=64, MAX_DOC_LEN=MAX_DOC_LEN, MAX_NB_WORDS=MAX_NB_WORDS, EMBEDDING_DIM=300)
# cnn_base.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# cnn_base.fit(x=X_test,y=y_test,batch_size=20)
# cnn_base.summary()
# plot_model(cnn_base,show_shapes=True)
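# cnn_model returns only the concatenated convolution features (the Dense head above is
# commented out), so a classifier still needs to be stacked on top. A minimal sketch of one
# way to do that, assuming a 3-class problem; the wrapper name and hyper-parameters are
# illustrative, not part of the original training script.
from tensorflow.keras.layers import Dense, Dropout

def build_cnn_classifier(max_nb_words, max_doc_len, num_classes=3, embedding_matrix=None):
    """Hypothetical wrapper: attach a dropout + softmax head to the cnn_model feature extractor."""
    base = cnn_model(FILTER_SIZES=[2, 3, 4], MAX_NB_WORDS=max_nb_words + 1,  # +1 for the padding row, as noted above
                     MAX_DOC_LEN=max_doc_len, EMBEDDING_DIM=200, NUM_FILTERS=64,
                     PRETRAINED_WORD_VECTOR=embedding_matrix)
    x = Dropout(0.5)(base.output)
    pred = Dense(num_classes, activation='softmax')(x)
    clf = Model(inputs=base.input, outputs=pred, name='cnn_classifier')
    clf.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return clf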
##################################################
# history plot
##################################################
def history_plot(training, extra_metric=None):
    ################## plot the training history
    # training.history keys look like ['loss', 'acc', 'auroc', 'val_loss', 'val_acc', 'val_auroc']
    # e.g. loss: 0.8109 - acc: 0.6362 - auroc: 0.7960 - val_loss: 0.6793 - val_acc: 0.7144 - val_auroc: 0.8684
    dic = list(training.history.keys())
    if extra_metric is not None:
        idx = [[0, 3], [1, 4], [2, 5]]  # pair each training metric with its validation counterpart
    else:
        idx = [[0, 2], [1, 3]]
    for i, j in idx:
        print("========================================================================")
        print(dic[i], dic[j])
        xx = list(range(1, len(training.history[dic[i]]) + 1))
        plt.plot(xx, training.history[dic[i]], color='navy', lw=2, label='Model_' + str(dic[i]))
        plt.plot(xx, training.history[dic[j]], color='darkorange', lw=2, label='Model_' + str(dic[j]))
        plt.title(str(dic[i]) + " vs. " + str(dic[j]))
        plt.xlabel('Epochs')
        plt.ylabel(str(dic[i]))
        plt.legend()
        plt.show()
    return None
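# Typical usage (sketch): pass the History object returned by model.fit straight to
# history_plot; the names below are placeholders for objects created elsewhere.
# training = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)
# history_plot(training)                        # loss / accuracy curves
# history_plot(training, extra_metric='auroc')  # when a third metric was compiled into the model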