Skip to content

Commit

Permalink
Fix bulldogjob scrapper (#33)
Browse files Browse the repository at this point in the history
* Fix bulldogjob reachable test

* Fix bulldogjob scrapper

* Update exception
  • Loading branch information
wkobiela committed Sep 9, 2023
1 parent 2f05968 commit 12e1217
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 23 deletions.
53 changes: 32 additions & 21 deletions scrappers/bulldogjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,41 @@ def updateJobsDict(self, url):
text = ""
try:
for page_num in range(1, max_pages + 1):
page = requests.get(url+f"/page,{page_num}", timeout=120)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/58.0.3029.110 Safari/537.36'
}
page = requests.get(url+f"/page,{page_num}", timeout=120, headers=headers)
page_soup = BeautifulSoup(page.content, "html.parser")
job_links_list = page_soup.find_all("a", {"href": re.compile('.*bulldogjob.pl/companies/jobs/.*')})
for job in job_links_list:
#workaround for false job objects
if not job.find('button', class_=re.compile("flex items-center w-full relative text-xs", re.I)):
continue
job_link = job.get('href')
job_title = job.find(name="h3", class_="text-18 font-extrabold leading-8 mr-8 md:mr-0")
job_title = job_title.find(text=True, recursive=False).text
job_company = job.find('div', class_=re.compile("text-xxs uppercase", re.I)).text
job_salary = job.find('div', class_=re.compile("lg:font-extrabold md:text-xl text-dm", re.I))
if job_salary.find(text=True, recursive=True) is not None:
job_salary = job_salary.find(text=True, recursive=True).text
else:
job_salary="Brak informacji"
job_overall_info = job.find_all('div', class_=re.compile("flex items-start", re.I))
for info in job_overall_info:
text = text + info.find('span').text + " / "
self.jobs_dict[job_link] = {"Title": [job_title],
"Company": [job_company],
"Salary": [job_salary],
"Location": [text]}
text = ""
try:
#workaround for false job objects
if not job.find('button', class_=re.compile("flex items-center w-full relative text-xs", re.I)):
continue
job_link = job.get('href')
job_title = job.find(name="h3",
class_="md:mb-5 lg:mb-0 text-18 font-extrabold leading-8 mr-8 md:mr-0")
if job_title is not None:
job_title = job_title.find(text=True, recursive=False).text
else:
job_title="Sprawdź regex."
job_company = job.find('div', class_=re.compile("text-xxs uppercase", re.I)).text
job_salary = job.find('div', class_=re.compile("lg:font-extrabold md:text-xl text-dm", re.I))
if job_salary.find(text=True, recursive=True) is not None:
job_salary = job_salary.find(text=True, recursive=True).text
else:
job_salary="Brak informacji"
job_overall_info = job.find_all('div', class_=re.compile("flex items-start", re.I))
for info in job_overall_info:
text = text + info.find('span').text + " / "
self.jobs_dict[job_link] = {"Title": [job_title],
"Company": [job_company],
"Salary": [job_salary],
"Location": [text]}
text = ""
except Exception as ie:
print(f"Exception {ie} on {job}.")
except Exception as e:
print(f"Exception {e} on updateJobsDict.")

Expand Down
8 changes: 6 additions & 2 deletions tests/test_linksReachable.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@ def test_is_NoFluffJobs_reachable():
assert response.status_code == 200

def test_is_BulldogJob_reachable():
link = createLinks(site='BulldogJob', role="qa", lvl="junior,mid", city="Remote,Gdańsk")
response = requests.get(link, timeout=120)
link = createLinks(site='BulldogJob', role="qa,tester", lvl="junior,medium", city="Remote,Gdańsk")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/58.0.3029.110 Safari/537.36'
}
response = requests.get(link, timeout=120, headers=headers)
assert response.status_code == 200

def test_is_JustJoinIt_reachable():
Expand Down

0 comments on commit 12e1217

Please sign in to comment.