-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_matched_rest_val.py
64 lines (51 loc) · 2.82 KB
/
find_matched_rest_val.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import json
from tqdm import tqdm
"""
Given vqa.val1000.jsonl or vqa.rest_val.jsonl used in BEiT3 model as a validation subset,
the purpose of this script is to find the corresponding questions and annotations in vqa.val1000.jsonl
and ensure that we are evaluating our algorithm on the same subset for a fair comparison.
"""
# Load vqa.val1000.jsonl
vqa_annotations = []
with open('/tmp/datasets/coco/vqa.rest_val.jsonl', 'r') as file:
for line in file:
vqa_annotations.append(json.loads(line))
# Load v2_OpenEnded_mscoco_val2014_questions.json
with open('/tmp/datasets/coco/vqa/v2_OpenEnded_mscoco_val2014_questions.json', 'r') as file:
coco_questions = json.load(file)
# Function to extract image_id from the image_path
def extract_image_id(image_path):
parts = image_path.split('_')[-1] # Split by underscore and take the last part
image_id = int(parts.split('.')[0]) # Remove file extension and convert to int
return image_id
# Match annotations
matched_questions = []
for vqa_ann in tqdm(vqa_annotations):
image_id = extract_image_id(vqa_ann['image_path'])
qid = vqa_ann['qid']
for question in coco_questions['questions']:
if question['image_id'] == image_id and question['question_id'] == qid:
matched_questions.append(question)
break # Assuming each qid is unique, break after finding a match
# Assuming we have the matched_questions from the previous step
# Let's extract the question_ids from matched_questions
matched_question_ids = [question['question_id'] for question in matched_questions]
# Writing matched questions to a new file
with open('/tmp/datasets/coco/vqa/v2_OpenEnded_mscoco_rest_val2014_questions.json', 'w') as outfile:
json.dump({"questions": matched_questions}, outfile)
print(f"Stored {len(matched_questions)} matched questions in v2_OpenEnded_mscoco_rest_val2014_questions.json")
# ______________________________________________________________________________________________________________________
# Load v2_mscoco_val2014_annotations.json
with open('/tmp/datasets/coco/vqa/v2_mscoco_val2014_annotations.json', 'r') as file:
coco_annotations = json.load(file)
# Prepare a dict to hold matched annotations keyed by question_id
matched_annotations = {}
for qid in tqdm(matched_question_ids):
for ann in coco_annotations['annotations']:
if ann['question_id'] == qid:
matched_annotations[str(qid)] = ann
break # Assuming each question_id is unique, break after finding a match
# Writing matched annotations (as a dictionary) to a new file
with open('/tmp/datasets/coco/vqa/v2_mscoco_rest_val2014_annotations.json', 'w') as outfile:
json.dump(matched_annotations, outfile, indent=4) # Use indent for pretty printing
print(f"Stored {len(matched_annotations)} matched annotations in v2_mscoco_rest_val2014_annotations.json")