"""
A Reader simply reads data from disk and returns it almost as is, based on
a "primary key", which for the VisDial v1.0 dataset is the ``image_id``.
Readers are meant to be used by torch ``Dataset``s. Data pre-processing,
such as tokenizing words to integers, embedding tokens, or passing an image
through a pre-trained CNN, is not recommended in the reader.

Each reader must implement at least three methods, as sketched below:
    - ``__len__`` to return the number of examples this Reader can read.
    - ``__getitem__`` to return data based on ``image_id`` (for the VisDial
      v1.0 dataset).
    - ``keys`` to return a list of possible ``image_id``s this Reader can
      provide data of.
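
A minimal sketch of this interface (a hypothetical ``ToyReader``, shown
here only for illustration)::

    class ToyReader(object):
        def __init__(self, data):
            # ``data`` is assumed to be a dict keyed by primary key.
            self._data = data

        def __len__(self):
            return len(self._data)

        def __getitem__(self, key):
            return self._data[key]

        def keys(self):
            return list(self._data.keys())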
"""

import copy
import json
import multiprocessing as mp
from typing import Any, Dict, List, Optional, Set, Union

import h5py
# A bit slow, and it only splits sentences into lists of words; this could
# also be done inside ``DialogsReader``.
from nltk.tokenize import word_tokenize
from tqdm import tqdm


class DialogsReader(object):
"""
    A simple reader for VisDial v1.0 dialog data. The json file must have the
    same structure as described at ``https://visualdialog.org/data``.

    Parameters
    ----------
    dialogs_jsonpath : str
        Path to a json file containing VisDial v1.0 train, val or test data.
    num_examples : int, optional (default = None)
        Process only the first ``num_examples`` dialogs from the split.
        Useful for speeding up debugging.
    num_workers : int, optional (default = 1)
        Number of worker processes used to tokenize questions, answers and
        captions in parallel.
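
    Examples
    --------
    A usage sketch (the json path below is an assumption; point it at your
    own copy of the VisDial v1.0 data)::

        >>> reader = DialogsReader("data/visdial_1.0_train.json")
        >>> item = reader[reader.keys()[0]]
        >>> sorted(item.keys())
        ['caption', 'dialog', 'image_id', 'num_rounds']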
"""

    def __init__(
self,
dialogs_jsonpath: str,
num_examples: Optional[int] = None,
num_workers: int = 1,
):
with open(dialogs_jsonpath, "r") as visdial_file:
visdial_data = json.load(visdial_file)
self._split = visdial_data["split"]
# Maintain questions and answers as a dict instead of list because
# they are referenced by index in dialogs. We drop elements from
# these in "overfit" mode to save time (tokenization is slow).
self.questions = {
i: question for i, question in
enumerate(visdial_data["data"]["questions"])
}
self.answers = {
i: answer for i, answer in
enumerate(visdial_data["data"]["answers"])
}
# Add empty question, answer - useful for padding dialog rounds
# for test split.
self.questions[-1] = ""
self.answers[-1] = ""
        # ``image_id`` serves as the key for all three dicts here.
self.captions: Dict[int, Any] = {}
self.dialogs: Dict[int, Any] = {}
self.num_rounds: Dict[int, Any] = {}
all_dialogs = visdial_data["data"]["dialogs"]
# Retain only first ``num_examples`` dialogs if specified.
if num_examples is not None:
all_dialogs = all_dialogs[:num_examples]
for _dialog in all_dialogs:
self.captions[_dialog["image_id"]] = _dialog["caption"]
# Record original length of dialog, before padding.
# 10 for train and val splits, 10 or less for test split.
self.num_rounds[_dialog["image_id"]] = len(_dialog["dialog"])
# Pad dialog at the end with empty question and answer pairs
# (for test split).
while len(_dialog["dialog"]) < 10:
_dialog["dialog"].append({"question": -1, "answer": -1})
# Add empty answer (and answer options) if not provided
# (for test split). We use "-1" as a key for empty questions
# and answers.
for i in range(len(_dialog["dialog"])):
if "answer" not in _dialog["dialog"][i]:
_dialog["dialog"][i]["answer"] = -1
if "answer_options" not in _dialog["dialog"][i]:
_dialog["dialog"][i]["answer_options"] = [-1] * 100
self.dialogs[_dialog["image_id"]] = _dialog["dialog"]
# If ``num_examples`` is specified, collect questions and answers
# included in those examples, and drop the rest to save time while
# tokenizing. Collecting these should be fast because num_examples
# during debugging are generally small.
if num_examples is not None:
questions_included: Set[int] = set()
answers_included: Set[int] = set()
for _dialog in self.dialogs.values():
for _dialog_round in _dialog:
questions_included.add(_dialog_round["question"])
answers_included.add(_dialog_round["answer"])
for _answer_option in _dialog_round["answer_options"]:
answers_included.add(_answer_option)
self.questions = {
i: self.questions[i] for i in questions_included
}
self.answers = {
i: self.answers[i] for i in answers_included
}
self._multiprocess_tokenize(num_workers)

    def _multiprocess_tokenize(self, num_workers: int):
"""
        Tokenize captions, questions and answers in parallel processes. This
        method uses the multiprocessing module internally.

        Since questions, answers and captions are dicts, and multiprocessing
        map utilities operate on lists, we convert these to lists first and
        then back to dicts.
Parameters
----------
num_workers: int
Number of workers (processes) to run in parallel.
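
        A minimal sketch of the dict-to-list round trip described above
        (``tokenize`` stands in for any per-sequence tokenizer, and ``data``
        and ``pool`` are assumed names)::

            keys, values = zip(*data.items())
            tokenized = pool.map(tokenize, values)
            data = dict(zip(keys, tokenized))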
"""
        # While displaying the progress bar through tqdm, specify the total
        # number of sequences to tokenize, because tqdm cannot infer it from
        # ``pool.imap``.
with mp.Pool(num_workers) as pool:
print(f"[{self._split}] Tokenizing questions...")
_question_tuples = self.questions.items()
_question_indices = [t[0] for t in _question_tuples]
_questions = list(
tqdm(
pool.imap(word_tokenize, [t[1] for t in _question_tuples]),
total=len(self.questions)
)
)
self.questions = {
i: question + ["?"] for i, question in
zip(_question_indices, _questions)
}
# Delete variables to free memory.
del _question_tuples, _question_indices, _questions
print(f"[{self._split}] Tokenizing answers...")
_answer_tuples = self.answers.items()
_answer_indices = [t[0] for t in _answer_tuples]
_answers = list(
tqdm(
pool.imap(word_tokenize, [t[1] for t in _answer_tuples]),
total=len(self.answers)
)
)
            # Unlike questions, do not append a trailing "?" to answers.
            self.answers = {
                i: answer for i, answer in
                zip(_answer_indices, _answers)
            }
# Delete variables to free memory.
del _answer_tuples, _answer_indices, _answers
print(f"[{self._split}] Tokenizing captions...")
# Convert dict to separate lists of image_ids and captions.
_caption_tuples = self.captions.items()
_image_ids = [t[0] for t in _caption_tuples]
_captions = list(
tqdm(
pool.imap(word_tokenize, [t[1] for t in _caption_tuples]),
                    total=len(_caption_tuples)
)
)
# Convert tokenized captions back to a dict.
self.captions = {i: c for i, c in zip(_image_ids, _captions)}

    def __len__(self):
return len(self.dialogs)

    def __getitem__(self, image_id: int) -> Dict[str, Union[int, str, List]]:
caption_for_image = self.captions[image_id]
        # Deep copy the dialog so that replacing indices with tokens below
        # does not mutate ``self.dialogs`` (rounds are dicts shared by
        # reference, so a shallow copy would not protect them).
        dialog = copy.deepcopy(self.dialogs[image_id])
num_rounds = self.num_rounds[image_id]
        # Replace question and answer indices with actual word tokens.
        for i in range(len(dialog)):
            dialog[i]["question"] = self.questions[dialog[i]["question"]]
            dialog[i]["answer"] = self.answers[dialog[i]["answer"]]
            for j, answer_option in enumerate(dialog[i]["answer_options"]):
                dialog[i]["answer_options"][j] = self.answers[answer_option]
return {
"image_id": image_id,
"caption": caption_for_image,
"dialog": dialog,
"num_rounds": num_rounds,
}

    def keys(self) -> List[int]:
return list(self.dialogs.keys())

    @property
def split(self):
return self._split


class DenseAnnotationsReader(object):
"""
    A reader for dense annotations of the val split. The json file must have
    the same structure as described at ``https://visualdialog.org/data``.

Parameters
----------
dense_annotations_jsonpath : str
        Path to a json file containing dense annotations of the VisDial v1.0
        val split.
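
    Examples
    --------
    A usage sketch (the json path below is an assumption)::

        >>> reader = DenseAnnotationsReader(
        ...     "data/visdial_1.0_val_dense_annotations.json"
        ... )
        >>> annotation = reader[reader.keys()[0]]
        >>> sorted(annotation.keys())
        ['gt_relevance', 'image_id', 'round_id']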
"""

    def __init__(self, dense_annotations_jsonpath: str):
with open(dense_annotations_jsonpath, "r") as visdial_file:
self._visdial_data = json.load(visdial_file)
self._image_ids = [
entry["image_id"] for entry in self._visdial_data
]

    def __len__(self):
return len(self._image_ids)

    def __getitem__(self, image_id: int) -> Dict[str, Union[int, List]]:
index = self._image_ids.index(image_id)
# keys: {"image_id", "round_id", "gt_relevance"}
        return self._visdial_data[index]

    def keys(self) -> List[int]:
        # Provided for interface parity with the other readers.
        return self._image_ids

    @property
def split(self):
        # Dense annotations are available for the val split only.
        return "val"


class ImageFeaturesHdfReader(object):
"""
    A reader for HDF files containing pre-extracted image features. A typical
    HDF file is expected to have a dataset named "image_id", and another
    dataset named "features".

    Example of an HDF file:

    ```
    visdial_train_faster_rcnn_bottomup_features.h5
    |--- "image_id" [shape: (num_images, )]
    |--- "features" [shape: (num_images, num_proposals, feature_size)]
    +--- .attrs ("split", "train")
    ```

    Refer to the ``$PROJECT_ROOT/data/extract_bottomup.py`` script for more
    details about the HDF structure.
Parameters
----------
features_hdfpath : str
Path to an HDF file containing VisDial v1.0 train, val or test split
image features.
in_memory : bool
        Whether to load the whole HDF file into memory. Beware, these files
        are sometimes tens of GBs in size. Set this to True only if you have
        sufficient RAM; it is a trade-off between speed and memory.
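
    Examples
    --------
    A usage sketch (the HDF path below is an assumption)::

        >>> reader = ImageFeaturesHdfReader(
        ...     "data/visdial_train_faster_rcnn_bottomup_features.h5"
        ... )
        >>> features = reader[reader.keys()[0]]
        >>> features.shape  # (num_proposals, feature_size)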
"""

    def __init__(self, features_hdfpath: str, in_memory: bool = False):
self.features_hdfpath = features_hdfpath
self._in_memory = in_memory
with h5py.File(self.features_hdfpath, "r") as features_hdf:
self._split = features_hdf.attrs["split"]
self._image_id_list = list(features_hdf["image_id"])
# "features" is List[np.ndarray] if the dataset is loaded in-memory
# If not loaded in memory, then list of None.
self.features = [None] * len(self._image_id_list)

    def __len__(self):
return len(self._image_id_list)

    def __getitem__(self, image_id: int):
index = self._image_id_list.index(image_id)
if self._in_memory:
            # Load features lazily during the first epoch, one image at a
            # time, instead of loading everything up front (slow start).
if self.features[index] is not None:
image_id_features = self.features[index]
else:
with h5py.File(self.features_hdfpath, "r") as features_hdf:
image_id_features = features_hdf["features"][index]
self.features[index] = image_id_features
else:
            # Read the chunk from file every time if not loaded in memory.
with h5py.File(self.features_hdfpath, "r") as features_hdf:
image_id_features = features_hdf["features"][index]
return image_id_features

    def keys(self) -> List[int]:
return self._image_id_list

    @property
def split(self):
return self._split