-
Notifications
You must be signed in to change notification settings - Fork 10
/
scraper.py
337 lines (243 loc) · 8.82 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
# -*- coding: utf-8 -*-
import requests
import json
import traceback
from firebase import firebase
from urlparse import urlparse, urljoin
from datetime import datetime
import sqlalchemy
import bleach
from hnapp import app, db
from models.item import Item
from models.lost_item import LostItem
from models.status import Status
from utils import debug_print
from errors import AppError, ScraperError
class Scraper(object):
    """
    Fetches items from the Hacker News Firebase API and stores them in the database

    The fetch_* methods talk to the API, compile_item_data converts raw API
    output to Item fields, and the save_* methods write the result to the database
    """

    # Firebase API client, created by connect()
    firebase = None
    base_url = 'https://hacker-news.firebaseio.com/v0/'

    def connect(self):
        """
        Connect to firebase API
        Safe to call more than once: the client is only created on the first call
        The fetch_* methods call this themselves, so calling it up front is optional
        """
        if not self.firebase:
            self.firebase = firebase.FirebaseApplication(self.base_url, None)

    def save_newest_items(self):
        """
        Fetch and save every item that is newer than the newest item in the database
        Includes all acceptable kinds of items (ignores pollopts)
        Saves items in order of ascending ID - important for parent_id and root_id references
        The stored max item id is advanced after each processed item
        """
        # Get latest item ids from db and from api
        max_api_id = self.fetch_max_item_id()
        max_db_id = Status.get_max_item_id()

        # If no new data (or the API gave no answer), quit.
        # We don't want to use this function to update items
        if max_api_id is None or max_api_id <= max_db_id:
            return

        # Save latest items and update max id.
        # range() excludes its upper bound, so add 1 to include max_api_id itself
        save = lambda item_data: self.save_item(item_data, update_max_id=True)
        self.fetch_items(range(max_db_id + 1, max_api_id + 1), callback=save)

    def save_newest_existing_stories(self, start_from=0, count=100, min_delay=0):
        """
        Re-fetch and save the newest stories that are already in the database
        Does not fetch new stories, uses only those available in database
        Basically this updates story scores

        start_from and count select a window in the list of stories ordered by descending id
        min_delay (seconds) skips items that were updated more recently than that
        """
        # Generate list of newest stories (ids only, deleted and dead ones excluded)
        stories = (
            db.session.query(Item)
            .with_entities(Item.id)
            .filter(Item.kind == 'story')
            .filter(Item.deleted == 0, Item.dead == 0)
            .order_by(sqlalchemy.desc(Item.id))
            .slice(start_from, count + start_from)
            .all()
        )
        story_ids = [story.id for story in stories]

        save = lambda item_data: self.save_item(item_data)
        self.fetch_items(story_ids, callback=save, min_delay=min_delay)

    def save_top_stories(self, front_page, start_from=0, count=100, min_delay=0):
        """
        Save top stories (ranked by front page order)
        Must indicate whether we're loading front page or not

        start_from and count select a window in the ranked list of top story ids
        min_delay (seconds) skips items that were updated more recently than that
        """
        # Generate list of top stories; treat a missing API answer as an empty list
        top_ids = self.fetch_top_story_ids() or []
        story_ids = top_ids[start_from:start_from + count]

        # Save top stories
        save = lambda item_data: self.save_item(item_data, front_page=front_page)
        self.fetch_items(story_ids, callback=save, min_delay=min_delay)

    def save_item(self, item_data, update_max_id=False, front_page=False):
        """
        Compile and save item to the database
        Each item is committed separately for better fault tolerance

        item_data is either a raw API dict or a LostItem instance
        update_max_id stores the item id as the new max item id
        front_page is passed on to compile_item_data
        Returns the saved Item/LostItem, or None if nothing was saved
        If the commit fails, the session is rolled back and the error is re-raised
        """
        item = None

        if isinstance(item_data, LostItem):
            # If item was lost
            if not item_data.id:
                return None
            item_id = item_data.id
            item = item_data
            db.session.add(item)
            debug_print("Lost %s because %s" % (item_data.id, item_data.reason), '\n')
        else:
            if 'id' not in item_data:
                debug_print('Skipping item because empty id', '\n')
                return None
            item_id = item_data['id']
            if item_data.get('type', None) in ('story', 'comment', 'poll', 'job', None):
                debug_print("Saving %d" % item_data['id'], '\n')
                compiled_data = self.compile_item_data(item_data, front_page)
                item = Item.create_or_update(compiled_data)
            else:
                debug_print("Skipping %s %d" % (item_data['type'], item_data['id']), '\n')

        # Skipped items also advance the max id, otherwise they would be fetched again
        if update_max_id:
            Status.set_max_item_id(item_id)

        try:
            db.session.commit()
        except Exception:
            # A failed commit leaves the session unusable until it is rolled back;
            # roll back so that the next item can still be saved, then report the error
            db.session.rollback()
            raise

        return item

    def bleach_html(self, html):
        """
        Sanitize HTML. Leave only safe/expected markup
        Wraps the text in paragraphs, keeps only a, i, p and pre tags
        (href is the only attribute allowed) and drops empty paragraphs
        None is treated as an empty string
        """
        if html is None:
            html = ''
        # The input uses bare <p> as a paragraph separator;
        # turn it into properly closed paragraphs before cleaning
        wrapped = '<p>' + html.replace('<p>', '</p>\n\n<p>') + '</p>'
        cleaned = bleach.clean(
            wrapped,
            tags=('a', 'i', 'p', 'pre'),
            attributes={'a': ['href']},
            styles=(),
            strip=True
        )
        return cleaned.replace('<p></p>', '')

    def compile_item_data(self, raw_item, front_page):
        """
        Convert raw API output for item to a source dict for an Item model

        raw_item is the item dict as returned by the API
        front_page tells whether the item was loaded from the front page
        Raises KeyError for item types other than comment/story/poll/job/missing
        (save_item filters those out before calling this)
        """
        # fields map {api_field: hnapp_field}
        # 'type' is not listed here: kind and subkind are derived from it below
        fields = {
            'id': 'id',
            'parent': 'parent_id',
            'title': 'title',
            'text': 'raw_body',
            'by': 'author',
            'score': 'score',
            'dead': 'dead',
            'deleted': 'deleted'
        }
        item_data = {}

        # Set standard fields listed above
        for raw_field, model_field in fields.items():
            if raw_field in raw_item:
                item_data[model_field] = raw_item[raw_field]

        # Set time
        # The API time is a unix timestamp; convert it to naive UTC so that it is
        # comparable with the datetime.utcnow() values used for the other dates
        if 'time' in raw_item:
            item_data['date_posted'] = datetime.utcfromtimestamp(raw_item['time'])

        # Set sanitized body
        if 'text' in raw_item:
            item_data['body'] = self.bleach_html(raw_item['text'])

        # Set URL and domain
        # Some of the possible "broken" URLs: None, "", " "
        # Those have no hostname, so both url and domain stay None for them
        item_data['url'] = None
        item_data['domain'] = None
        if raw_item.get('url', None) is not None:
            parsed_url = urlparse(raw_item['url'])
            if parsed_url.hostname is not None:
                domain = parsed_url.hostname.lower()
                if domain.startswith('www.'):
                    domain = domain[4:]
                item_data['domain'] = domain
                item_data['url'] = raw_item['url']  # <<< TODO should we use urlunparse here? https://docs.python.org/2/library/urlparse.html#urlparse.urlunparse

        # Detect broken stories
        if raw_item.get('type', None) != 'comment' and 'title' not in raw_item:
            item_data['deleted'] = True

        # Set kind and subkind
        # item types map {api_type: [hnapp_kind, hnapp_subkind]}
        item_types = {
            'comment': ['comment', 'comment'],
            'story': ['story', 'link'],
            'poll': ['story', 'poll'],
            'job': ['story', 'job'],
            None: ['story', 'link']  # <<< TODO broken item... maybe these fields should be nullable in DB
        }
        item_data['kind'], item_data['subkind'] = item_types[raw_item.get('type', None)]

        # Special treatment for ask/show stories
        # Non-deleted links always have a title here: title-less ones were marked deleted above
        if item_data['subkind'] == 'link' and not item_data.get('deleted', False):
            if 'text' in raw_item and item_data.get('domain', None) is None:
                # Semi-killed items like 8549613 can have no URL, but also don't have a text field
                # Non-broken items on the other hand do have the text field, empty if inapplicable
                item_data['subkind'] = 'ask'
            if item_data['title'].lower().startswith('show hn:'):
                item_data['subkind'] = 'show'

        # Set child ids
        if 'kids' in raw_item:
            item_data['child_ids'] = ','.join(str(child_id) for child_id in raw_item['kids'])

        # Set date when item entered front page
        if front_page:
            item_data['date_entered_fp'] = datetime.utcnow()

        # Restore non-deleted and non-dead status
        # Also cast those to int-s for postgresql
        for flag in ('deleted', 'dead'):
            item_data[flag] = int(item_data.get(flag, 0))

        return item_data

    def fetch_max_item_id(self):
        """
        Fetch max item id available via HN Firebase API
        """
        debug_print(">> fetch_max_item_id")
        self.connect()
        max_id = self.firebase.get('maxitem', None)
        debug_print(max_id, '\n')
        return max_id

    def fetch_item(self, item_id):
        """
        Fetch item data by id
        Returns the raw item dict, or a LostItem if the API returned null
        or responded with an HTTP error
        If a LostItem with this id is already stored, that one is returned
        instead of a new instance
        """
        debug_print(">> fetch_item %d" % item_id)
        self.connect()

        try:
            item = self.firebase.get('item', item_id)
        except requests.exceptions.HTTPError as e:
            # If API error encountered, return a LostItem instead
            lost_item = db.session.query(LostItem).get(item_id)
            if lost_item is None:
                lost_item = LostItem(
                    id=item_id,
                    reason='HTTP/%s' % e.response.status_code,
                    response=e.response.text,
                    traceback=traceback.format_exc()
                )
            return lost_item

        if item is None:
            # Reuse the stored LostItem if there is one, same as for HTTP errors:
            # adding a second instance with the same id would fail on commit
            lost_item = db.session.query(LostItem).get(item_id)
            if lost_item is None:
                lost_item = LostItem(id=item_id, reason='null')
            return lost_item

        return item

    def fetch_items(self, item_ids, callback=None, min_delay=0):
        """
        Fetch items for item ids
        Output format: {id1: {item1}, id2: {item2}, ...}
        Failed items might appear as LostItem instances instead of attribute dictionaries

        callback, if given, is called with each fetched item right after it is fetched
        min_delay (seconds), if positive, skips items that are in the database
        and were updated more recently than that; skipped items are not in the output
        """
        debug_print(">> fetch_items")
        items = {}

        for item_id in item_ids:
            # Skip items that have been recently updated, if requested
            if min_delay > 0:
                db_item = db.session.query(Item).get(item_id)
                if db_item is not None:
                    age = (datetime.utcnow() - db_item.date_updated).total_seconds()
                    if min_delay > age:
                        debug_print("Skipped item %d because it's too fresh" % item_id)
                        continue

            api_item = self.fetch_item(item_id)
            items[item_id] = api_item
            if callback is not None:
                callback(api_item)

        return items

    def fetch_top_story_ids(self):
        """
        Fetch ids of the current top stories
        Ordered by current front page rank
        """
        debug_print(">> fetch_top_story_ids")
        self.connect()
        return self.firebase.get('topstories', None)