This repository has been archived by the owner on Oct 6, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
load_data.py
201 lines (160 loc) · 5.47 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python2
from __future__ import print_function
import sys
import numpy as np
import random
def precompute(filename=None,
               multiclass=False,
               balance=True,
               window_step=None,
               window_size=None):
    """
    Scan the input file once and gather the counts needed to lazily batch
    it from disk directly into the model.

    The file is consumed ``window_step`` characters at a time. The first
    read only seeds the sliding window; every following non-empty read
    counts as one window, made of the last ``window_size`` characters seen
    so far. A window is labelled True when it contains a line break.

    Arguments:
        filename: path of the text file; falls back to ``sys.argv[1]``.
        multiclass: when set, only windows containing a line break are
            counted in the returned N (no class balancing is computed).
        balance: when set (and ``multiclass`` is not), work out which
            class is larger and how many of its windows must be dropped
            to even out the two classes.
        window_step: number of characters read per step.
        window_size: number of trailing characters forming one window.

    Returns:
        ``(larger_class, remove_items, N)`` where ``larger_class`` is
        ``'True'``, ``'False'`` or ``None`` (classes even, or balancing not
        requested), ``remove_items`` is how many windows of the larger
        class to drop, and ``N`` is the number of windows left to use.
    """
    count_classes = balance and not multiclass
    # The window contents only matter when we have to look for breaks.
    track_window = count_classes or multiclass

    # count the total number of windows
    N = 0
    windows_with_break = 0

    if count_classes:
        print("Pre-calculating class balance requirements")

    with open(filename or sys.argv[1], 'r') as f:
        text = f.read(window_step)
        while True:
            chunk = f.read(window_step)
            if not chunk:
                break
            N += 1
            if track_window:
                # Keep only the current window. The label is taken from
                # the whole window (not just the new piece) so the counts
                # agree with how gen_training_data labels its windows.
                text = (text + chunk)[-window_size:]
                if "\n" in text:
                    windows_with_break += 1

    # balancing classes via downsampling larger class
    larger_class = None
    # count how many of larger class to remove to balance
    remove_items = 0

    if count_classes:
        total_True = windows_with_break
        total_False = N - windows_with_break
        if total_True > total_False:
            larger_class = 'True'
        elif total_True < total_False:
            larger_class = 'False'

        if larger_class is not None:
            remove_items = abs(total_True - total_False)
            N -= remove_items

        print("T\t", total_True)
        print("F\t", total_False)
        if larger_class is not None:
            print("R\t", remove_items)
            print("C\t", larger_class)

    print("N total windows", N)

    # in multiclass we only use windows containing breaks
    if multiclass:
        print('old N', N, 'new N', windows_with_break)
        N = windows_with_break

    return larger_class, remove_items, N
def gen_training_data(filename=None,
                      multiclass=False,
                      balance=True,
                      larger_class=None,
                      remove_items=0, N=0,
                      window_step=None,
                      window_size=None,
                      batch_size=None):
    """
    Lazily yield ``(X, y)`` training batches read from a text file.

    The file is read ``window_step`` characters at a time. Each batch
    starts by reading one step that only seeds the sliding window; every
    later step produces one window made of the last ``window_size``
    characters read so far in that batch. A window's label is whether it
    contains a line break.

    Arguments:
        filename: path of the text file; falls back to ``sys.argv[1]``.
        multiclass: when set, windows without a line break are skipped and
            ``y`` one-hot marks the position of the first break in the
            window; otherwise ``y`` holds a single 0/1 label per window.
        balance: when set, randomly drop windows whose label (as a
            string) equals ``larger_class`` until ``remove_items`` of them
            have been dropped.
        larger_class: ``'True'``, ``'False'`` or ``None``, as returned by
            ``precompute``.
        remove_items: number of ``larger_class`` windows to drop.
        N: total window count, used to scale the random drop test.
        window_step: number of characters read per step.
        window_size: number of characters per window (columns of ``X``).
        batch_size: number of rows per yielded batch.

    Yields:
        ``(X, y)`` uint8 arrays. ``X`` has shape
        ``(batch_size, window_size)`` and holds character ordinals, with
        line breaks replaced by spaces. ``y`` has shape ``(batch_size, 1)``
        or, in multiclass mode, ``(batch_size, window_size)``. Rows of the
        last batch that were not filled before end of file stay zero.
    """
    # NOTE: this only works with balance set to True
    print('Loading data...')
    with open(filename or sys.argv[1], 'r') as f:
        print("Beginning data vectorization")

        eof = False
        total = 0
        while not eof:
            X = np.zeros(shape=(batch_size, window_size), dtype='uint8')
            y_width = window_size if multiclass else 1
            y = np.zeros(shape=(batch_size, y_width), dtype='uint8')

            # The first read of a batch only provides leading context.
            text = f.read(window_step)
            if not text:
                break

            batch_i = 0
            while batch_i < batch_size:
                piece = f.read(window_step)
                if not piece:
                    eof = True
                    break

                # pad the last window if we pull up short
                # this is required because of the slicing below
                if len(piece) < window_step:
                    piece += ' ' * (window_step - len(piece))

                # Keep just the current window instead of letting the
                # buffer grow for the whole batch; the window is the same.
                text = (text + piece)[-window_size:]
                chunk = text

                y_value = "\n" in chunk
                if multiclass and not y_value:
                    continue

                # only balance for binary classification (for now)
                # do balancing randomly so we get a better mix of classes
                if (balance and str(y_value) == larger_class
                        and remove_items > 0):
                    # spread the downsampling out across the whole dataset
                    if (random.random() * N) > (N / 3.0):
                        remove_items -= 1
                        if remove_items == 0:
                            print("Classes balanced at", total)
                        # this window is dropped from the batch
                        continue

                if not multiclass:
                    y[batch_i] = 1 if y_value else 0
                else:
                    y[batch_i][chunk.index("\n")] = 1

                for j, c in enumerate(chunk):
                    # line breaks are fed to the model as plain spaces
                    X[batch_i][j] = ord(" " if c == "\n" else c)

                batch_i += 1
                total += 1
                if total % 100 == 0:
                    print("Processed: %s Imbal: %s" % (total, remove_items),
                          end='\r')

            yield X, y
# print('Splitting test/train')
# x_train = X[:int(len(X) * 0.75)]
# x_test = X[int(len(X) * 0.75):]
# y_train = y[:int(len(y) * 0.75)]
# y_test = y[int(len(y) * 0.75):]
# n_true = y.sum()
# n_false = y.shape[0] - n_true
# if not multiclass:
# print('True', n_true, 'False', n_false, '%', float(n_true) / n_false)
# del data; del X; del y
# return x_train, y_train, x_test, y_test