-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping.py
104 lines (83 loc) · 3.33 KB
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
This script scrapes Dice.com for job postings and saves them as HTML files.
It uses `requests` for the scraping itself and `multiprocessing` for parallelization.
50 workers are spun up and listen to a common queue for the scraping jobs.
Each job is initiated by a page identifier which is used to scrape the Dice.com job exploration page.
Upon scraping the explore page, the job_ids are retrieved and scraped one by one.
As the work on each exploration page is independent of the others, the load can be parallelized.
A job page that was already scraped and saved will not be scraped again by another worker.
"""
import json
import time
import warnings
import os
import requests
from requests.exceptions import HTTPError
from multiprocessing import Process, Queue
warnings.filterwarnings("ignore")
class Worker(Process):
    """Process that scrapes Dice.com search pages taken from a shared queue.

    Each queue item is a search-page index. For every index the worker
    downloads the search listing, extracts the job ids, and stores the HTML of
    every job that has no file in the output directory yet. A ``None`` item
    tells the worker to stop.
    """

    # Directory the scraped job pages are written to.
    _OUTPUT_DIR = 'dice'
    # Seconds to wait on a single request before treating it as failed;
    # without a timeout a stalled connection would block the worker forever.
    _REQUEST_TIMEOUT = 30
    # Upper bound on attempts per URL so a dead proxy or a permanent throttle
    # cannot keep a worker (and the parent's ``join``) hanging indefinitely.
    _MAX_ATTEMPTS = 10
    # First pause between attempts in seconds; doubled after every retry and
    # capped at ``_MAX_RETRY_DELAY``.
    _RETRY_DELAY = 0.1
    _MAX_RETRY_DELAY = 30.0
    # HTTP error statuses that are retried instead of being treated as final.
    _RETRYABLE_STATUSES = (429, 451)

    def __init__(self, queue: Queue):
        """Store the shared work queue and the proxy configuration.

        Args:
            queue: Queue delivering page indices, terminated by ``None``.
        """
        super().__init__()
        self._queue = queue
        self._proxies = {
            'http': 'YOUR_PROXY_HERE',
            'https': 'YOUR_PROXY_HERE',
        }

    def run(self):
        """Consume page indices from the queue until ``None`` is received."""
        for args in iter(self._queue.get, None):
            self._process(args)
            print(args, 'Done')

    def _process(self, idx):
        """Scrape one search page and save every job on it not yet on disk.

        Args:
            idx: Page number substituted into the search API URL.
        """
        url = (
            'https://job-search-api.svc.dhigroupinc.com/v1/dice/jobs/search'
            '?countryCode2=US&radius=5000&radiusUnit=km'
            f'&page={idx}&pageSize=100'
            '&facets=employmentType%7CpostedDate%7CworkFromHomeAvailability'
            '%7CworkplaceTypes%7CemployerType%7CeasyApply%7CisRemote'
            '&fields=id&culture=en&recommendations=true&interactionId=0'
            '&fj=true&includeRemote=true&eid=100465514_1004160904'
        )
        result = self._make_request(url)
        if result is None:
            return

        try:
            job_ids = self._find_all_job_ids(result.text)
        except ValueError as error:
            # json.JSONDecodeError is a ValueError; a non-JSON body must not
            # take the whole worker process down.
            print(f'Page {idx}: could not parse search response ({error})')
            return

        # The output directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)

        for job_id in job_ids:
            if self._is_job_already_found(job_id):
                continue
            job = self._get_job_details(job_id)
            if not job:
                # Writing an empty file would mark the job as scraped and
                # prevent any later run from fetching it again.
                continue
            with open(self._job_path(job_id), 'w', encoding='utf-16') as out_file:
                out_file.write(job)
            print(job_id)

    def _make_request(self, url):
        """GET ``url`` through the configured proxies, retrying on failure.

        Connection-level errors and the statuses in ``_RETRYABLE_STATUSES``
        are retried with an exponentially growing pause, at most
        ``_MAX_ATTEMPTS`` times in total.

        Args:
            url: Address to fetch.

        Returns:
            The successful response object, or ``None`` when the server
            answered with a non-retryable error status or all attempts failed.
        """
        delay = self._RETRY_DELAY
        for attempt in range(self._MAX_ATTEMPTS):
            if attempt:
                # Back off before every retry so a throttling server is not
                # hit again immediately.
                time.sleep(delay)
                delay = min(delay * 2, self._MAX_RETRY_DELAY)
            try:
                # NOTE(review): verify=False disables TLS certificate checks
                # and the API key is hard-coded; both are kept as-is here but
                # should be revisited.
                result = requests.get(
                    url,
                    proxies=self._proxies,
                    verify=False,
                    timeout=self._REQUEST_TIMEOUT,
                    headers={
                        'X-Api-Key': '1YAt0R9wBg4WfsF9VB2778F5CHLAPMVW3WAZcKd8'
                    },
                )
            except requests.RequestException:
                # Network/proxy/timeout problem: try again.
                continue
            # checking for errors -> trying again if needed
            try:
                result.raise_for_status()
            except HTTPError:
                if result.status_code not in self._RETRYABLE_STATUSES:
                    return None
                continue
            return result
        print(f'Giving up on {url} after {self._MAX_ATTEMPTS} attempts')
        return None

    def _find_all_job_ids(self, text):
        """Extract the job ids from a search API response body.

        Args:
            text: JSON document returned by the search endpoint.

        Returns:
            List with the ``id`` of every entry under the top-level ``data``
            key; empty when that key is absent or empty.

        Raises:
            ValueError: If ``text`` is not valid JSON.
        """
        payload = json.loads(text)
        items = payload.get('data') if isinstance(payload, dict) else None
        return [item['id'] for item in (items or [])]

    def _get_job_details(self, job_id):
        """Download the job-detail page for ``job_id``.

        Returns:
            The page body as text, or ``''`` when the request failed.
        """
        url = 'https://www.dice.com/job-detail/' + job_id
        result = self._make_request(url)
        return '' if result is None else result.text

    def _job_path(self, job_id):
        """Return the path of the HTML file that stores ``job_id``."""
        return os.path.join(self._OUTPUT_DIR, f'{job_id}.html')

    def _is_job_already_found(self, job_id):
        """Return True when a file for ``job_id`` already exists on disk."""
        return os.path.exists(self._job_path(job_id))
if __name__ == '__main__':
    # One queue is shared by every worker process; it carries the search-page
    # numbers to scrape.
    task_queue = Queue()

    # Launch the pool of 50 workers, starting each one right after creation.
    workers = []
    for _ in range(50):
        worker = Worker(task_queue)
        workers.append(worker)
        worker.start()

    # Enqueue search pages 1 through 99.
    page = 1
    while page < 100:
        task_queue.put(page)
        page += 1

    # One ``None`` sentinel per worker so that each of them leaves its loop.
    for _ in workers:
        task_queue.put(None)

    # Block until every worker has finished.
    for worker in workers:
        worker.join()