-
Notifications
You must be signed in to change notification settings - Fork 0
/
autoScrapper.py
90 lines (76 loc) · 3.94 KB
/
autoScrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import requests
import json
from tqdm import tqdm # Import tqdm for the progress bar
def fetch_wikidata_details(entity_id):
    """Fetch descriptive labels for one Wikidata entity (e.g. 'Q2182559').

    Queries the public Wikidata SPARQL endpoint for the entity's genre
    (P136), platform (P400), game mode (P404) and input device (P479).

    Returns a dict with keys 'genre', 'platform', 'gameModes',
    'inputDevice'; each value is a '; '-joined, sorted string of distinct
    labels (empty string when nothing was found or the request failed).
    """
    # Currently using : genre, platform, gameMode, inputDevice for each game title.
    # The SPARQL result is a cross-product of the OPTIONAL bindings, so
    # duplicate labels per column are expected and deduplicated below.
    query = f"""
    SELECT ?genreLabel ?platformLabel ?gameModeLabel ?inputDeviceLabel WHERE {{
      OPTIONAL {{ wd:{entity_id} wdt:P136 ?genre. }}
      OPTIONAL {{ wd:{entity_id} wdt:P400 ?platform. }}
      OPTIONAL {{ wd:{entity_id} wdt:P404 ?gameMode. }}
      OPTIONAL {{ wd:{entity_id} wdt:P479 ?inputDevice. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    # Single definition of the "nothing found / request failed" result.
    empty_details = {'genre': '', 'platform': '', 'gameModes': '', 'inputDevice': ''}

    url = 'https://query.wikidata.org/sparql'
    headers = {'User-Agent': 'YourAppName/1.0 ([email protected])'}  # Replace with your app's name and your email
    try:
        # timeout prevents a stalled connection from hanging the whole
        # scraping run; RequestException covers DNS/connection/timeout
        # errors that the original code let propagate and crash.
        r = requests.get(url, headers=headers,
                         params={'format': 'json', 'query': query},
                         timeout=30)
    except requests.RequestException as e:
        print(f"Request failed for {entity_id}: {e}")
        return empty_details

    # Check HTTP status code for error handling
    if r.status_code != 200:
        print(f"Error fetching data for {entity_id}. HTTP Status Code: {r.status_code}")
        print(r.text)  # Print response text for debugging
        return empty_details

    try:
        data = r.json()
    except ValueError as e:  # Catch JSON decode error
        print(f"Error decoding JSON for {entity_id}: {e}")
        print(r.text)  # Print response text for debugging
        return empty_details

    # Map SPARQL binding names to output keys, collect distinct labels.
    binding_to_key = {
        'genreLabel': 'genre',
        'platformLabel': 'platform',
        'gameModeLabel': 'gameModes',
        'inputDeviceLabel': 'inputDevice',
    }
    details = {key: set() for key in binding_to_key.values()}
    for item in data['results']['bindings']:
        for binding, key in binding_to_key.items():
            if binding in item:
                details[key].add(item[binding]['value'])

    # Sort before joining so the output is deterministic across runs
    # (plain set iteration order is not).
    return {key: '; '.join(sorted(values)) for key, values in details.items()}
# --- Parameters ---------------------------------------------------------
checkpoint_interval = 500           # save a checkpoint every N processed rows
start_from_checkpoint = 39000       # resume from this row index; set 0 to start fresh

# Load the CSV file of game entities to enrich.
df = pd.read_csv('Data/GameEntities.csv')

# Optionally, load enriched data from the last checkpoint if needed.
enriched_data = []
if start_from_checkpoint > 0:
    try:
        checkpoint_df = pd.read_csv(f'Data/EnrichedGameEntities_checkpoint_{start_from_checkpoint}.csv')
        enriched_data = checkpoint_df.to_dict('records')
    except FileNotFoundError:
        print(f"No checkpoint file found for index {start_from_checkpoint}. Starting from the beginning.")

# Iterate only the remaining rows instead of looping over everything and
# `continue`-ing past the first `start_from_checkpoint` rows. This also
# fixes the progress bar: the old loop passed `initial=` to tqdm AND
# still counted every skipped row as an update, so the bar overshot its
# total. `iloc` keeps the original row index, so the checkpoint/file
# naming arithmetic below is unchanged.
remaining = df.iloc[start_from_checkpoint:]
for index, row in tqdm(remaining.iterrows(), total=df.shape[0],
                       desc="Fetching Wikidata Details",
                       initial=start_from_checkpoint):
    entity_details = fetch_wikidata_details(row['Entities'])
    enriched_data.append({
        'video_gameLabel': row['video_gameLabel'],
        'Entities': row['Entities'],
        **entity_details
    })
    # Checkpoint: Save progress at regular intervals (and on the last row).
    if (index + 1 - start_from_checkpoint) % checkpoint_interval == 0 or (index + 1) == df.shape[0]:
        partial_df = pd.DataFrame(enriched_data)
        partial_df.to_csv(f'Data/EnrichedGameEntities_checkpoint_{index + 1}.csv', index=False)
        print(f"Checkpoint saved at index {index + 1}")

# Save the final complete dataframe at the end as well.
enriched_df = pd.DataFrame(enriched_data)
enriched_df.to_csv('Data/EnrichedGameEntities_final.csv', index=False)