Skip to content

Commit

Permalink
Fix JustjoinIT scrapper (#48)
Browse files Browse the repository at this point in the history
* First update on justjoinit scrapper
* Update runner
* Fix nofluffjobs scrapper
* Fix bulldogjob scrapper
* Reenable tests
  • Loading branch information
wkobiela committed Jan 7, 2024
1 parent 9f50292 commit 351ce26
Show file tree
Hide file tree
Showing 10 changed files with 128 additions and 75 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# jobScrapper

### JustJoinIt scrapper temporarily disabled, API is unreachable - no further information, if it will be back.

## Description
A simple Python project that makes it easier to stay up to date with job offers. Websites like BulldogJob, Nofluffjobs or JustJoinIt have this nasty feature - job offers that are "refreshed" are bumped to the top of the page, so it is easy to lose track and even apply to the same job twice.

Expand Down Expand Up @@ -66,13 +64,13 @@ bulldogjob_settings = {
### JustJoinIt

To set up the JustJoinIt scrapper, provide 3 MAIN parameters.
- role (list of strings) from available: `'testing', 'net', 'architecture', 'ruby', 'php', 'mobile', 'other', 'analytics', 'erp', 'go', 'admin', 'scala', 'pm', 'support', 'data', 'java', 'security', 'game', 'python', 'ux', 'c', 'javascript', 'devops', 'html'`
- lvl (list of strings) from available: `'junior', 'mid', 'senior'`
- role (single string) from available: `'testing', 'net', 'architecture', 'ruby', 'php', 'mobile', 'other', 'analytics', 'erp', 'go', 'admin', 'scala', 'pm', 'support', 'data', 'java', 'security', 'game', 'python', 'ux', 'c', 'javascript', 'devops', 'html'`
- lvl (single string, levels separated by a dot, e.g. `mid.senior`) from available: `'junior', 'mid', 'senior', 'c-level'`
- city (string) - remote offers are always included, plus optionally offers from the city of your choosing
```
justjoinit_settings = {
"role": "testing",
"lvl": ["mid", "junior"],
"lvl": "mid.senior",
"city": "Gdańsk"
}
```
Expand Down
64 changes: 64 additions & 0 deletions deprecated/justjoinit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import requests
from modules.base_logger import log
from modules.common import updateExcel

class JustJoinIt():
    """Client for the justjoin.it JSON offers endpoint.

    Offers that pass the filters are collected in ``self.jobs_dict``, keyed by
    the offer URL. Each value maps "Title", "Company", "Salary" and "Location"
    to single-element lists.
    """

    def __init__(self):
        self.jobs_dict = {}

    @staticmethod
    def _as_choices(value):
        """Return ``value`` as a tuple of accepted values.

        A bare string becomes a one-element tuple so that later ``in`` checks
        compare whole values instead of doing a substring search. ``None``
        becomes an empty tuple (nothing is accepted).
        """
        if value is None:
            return ()
        if isinstance(value, str):
            return (value,)
        return tuple(value)

    def updateJobsDict(self):
        """Download the offers list from the justjoin.it API.

        Returns:
            The ``requests`` response, or ``None`` when the request raised.
        """
        url = 'https://justjoin.it/api/offers'
        headers = {
            "content-type": "application/json, text/plain",
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) "
                "Gecko/20100101 Firefox/57.0"
            ),
            "Host": "justjoin.it",
            "Referer": "justjoin.it",
        }
        try:
            return requests.get(url, headers=headers, timeout=120)
        except Exception as e:
            print(f"Exception {e} on updateJobsDict.")
            return None

    def prepareJobsDict(self, response, role, lvl, city):
        """Filter the downloaded offers and store the matching ones.

        An offer is kept when its ``marker_icon`` is accepted by ``role``, its
        ``experience_level`` is accepted by ``lvl``, it is either remote or
        located in ``city``, and it is not flagged ``display_offer: False``.

        Args:
            response: Object whose ``json()`` returns a list of offer dicts,
                or ``None`` (failed download), in which case nothing is added.
            role: Accepted ``marker_icon`` value, or a collection of them.
            lvl: Accepted ``experience_level`` value, or a collection of them.
            city: Accepted city name, or a collection of city names.
        """
        if response is None:
            return

        roles = self._as_choices(role)
        levels = self._as_choices(lvl)
        cities = self._as_choices(city)

        for offer_dict in response.json():
            if offer_dict.get("marker_icon") not in roles:
                continue
            if offer_dict.get("experience_level") not in levels:
                continue
            # Exact comparison: the previous `not in ("remote")` was a
            # substring test on a plain string and raised on None.
            is_remote = offer_dict.get("workplace_type") == "remote"
            if not is_remote and offer_dict.get("city") not in cities:
                continue
            if offer_dict.get("display_offer") is False:
                continue

            url = f'https://justjoin.it/offers/{offer_dict["id"]}'
            self.jobs_dict[url] = {
                "Title": [offer_dict.get("title")],
                "Company": [offer_dict.get("company_name")],
                "Salary": [offer_dict.get("employment_types")],
                "Location": [offer_dict.get("city")],
            }

def run(sheetname, role, lvl, city):
    """Download justjoin.it offers, filter them and pass them to updateExcel.

    Args:
        sheetname: Sheet name forwarded to ``updateExcel``.
        role: Accepted offer categories, forwarded to ``prepareJobsDict``.
        lvl: Accepted experience levels, forwarded to ``prepareJobsDict``.
        city: Accepted city, forwarded to ``prepareJobsDict``.
    """
    log.info("Starting JustJoinIt scrapper.")
    just = JustJoinIt()
    resp = just.updateJobsDict()
    if resp is None:
        # updateJobsDict returns None when the request raised; there is
        # nothing to filter, so stop instead of failing on `None.json()`.
        log.error("JustJoinIt offers could not be downloaded, sheet %s not updated.", sheetname)
        return
    just.prepareJobsDict(resp, role, lvl, city)
    updateExcel(sheetname, just.jobs_dict)
    log.info("Finished JustJoinIt scrapper.")
3 changes: 3 additions & 0 deletions modules/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from modules.base_logger import log
from unidecode import unidecode

now = datetime.now()

Expand Down Expand Up @@ -101,5 +102,7 @@ def createLinks(**kwargs):
generated_link = f"https://bulldogjob.pl/companies/jobs/s/role,{role}/experienceLevel,{lvl}/city,{city}"
elif site == "NoFluffJobs":
generated_link = f"https://nofluffjobs.com/pl/praca-zdalna/{role}?criteria=city%3D{city}%20%20seniority%3D{lvl}"
elif site == "JustjoinIt":
generated_link = f"https://justjoin.it/{unidecode(city).lower()}/{role}/experience-level_{lvl}/remote_yes"
log.info("Generated link: %s", generated_link)
return(generated_link)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ pandas==2.1.4; python_version > '3.8'
pytest==7.4.4
Requests==2.31.0
pytest-html==4.1.1
unidecode==1.3.7
14 changes: 8 additions & 6 deletions runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@
}

justjoinit_settings = {
"role": ["testing"],
"lvl": ["mid", "junior"],
"site": "JustjoinIt",
"role": "testing",
"lvl": "mid.senior",
"city": "Gdańsk"
}

Expand All @@ -39,12 +40,13 @@
role=bulldogjob_settings['role'],
lvl=bulldogjob_settings['lvl'],
city=bulldogjob_settings['city'])
JUSTJOINIT_URL = common.createLinks(site=justjoinit_settings['site'],
role=justjoinit_settings['role'],
lvl=justjoinit_settings['lvl'],
city=justjoinit_settings['city'])

# Run setup and scrappers
setup.run(EXCEL_NAME, NOFLUFFJOBS_SHEET, BULLDOGJOB_SHEET, JUSTJOINIT_SHEET)
nofluffjobs.run(NOFLUFFJOBS_SHEET, NOFLUFFJOBS_URL)
bulldogjob.run(BULLDOGJOB_SHEET, BULLDOGJOB_URL)
# justjoinit.run(JUSTJOINIT_SHEET,
# role=justjoinit_settings['role'],
# lvl=justjoinit_settings['lvl'],
# city=justjoinit_settings['city'])
justjoinit.run(JUSTJOINIT_SHEET, JUSTJOINIT_URL)
27 changes: 18 additions & 9 deletions scrappers/bulldogjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,33 @@ def updateJobsDict(self, url):
for job in job_links_list:
try:
#workarount for false job objects
if not job.find('button', class_=re.compile("flex items-center w-full relative text-xs", re.I)):
if not job.find('div', class_=re.compile("flex flex-col items-center relative my-auto", re.I)):
continue

job_link = job.get('href')
job_title = job.find(name="h3",
class_="md:mb-5 lg:mb-0 text-18 font-extrabold leading-8 mr-8 md:mr-0")
if job_title is not None:

job_title = job.find(name="h3", class_=re.compile("md:mb-5 lg:mb-0 md:text-18 text", re.I))
if job_title.find(text=True, recursive=True) is not None:
job_title = job_title.find(text=True, recursive=False).text
else:
job_title="Sprawdź regex."
job_company = job.find('div', class_=re.compile("text-xxs uppercase", re.I)).text
job_title="Regex error."

job_company = job.find('div', class_=re.compile("text-xxs uppercase", re.I))
if job_company.find(text=True, recursive=True) is not None:
job_company = job_company.find(text=True, recursive=True) is not None
else:
job_company = "Regex error"

job_salary = job.find('div', class_=re.compile("lg:font-extrabold md:text-xl text-dm", re.I))
if job_salary.find(text=True, recursive=True) is not None:
if job_salary is not None:
job_salary = job_salary.find(text=True, recursive=True).text
else:
job_salary="Brak informacji"
job_salary="No information or regex error"

job_overall_info = job.find_all('div', class_=re.compile("flex items-start", re.I))
for info in job_overall_info:
text = text + info.find('span').text + " / "
text = text + info.find('span').text + " / "

self.jobs_dict[job_link] = {"Title": [job_title],
"Company": [job_company],
"Salary": [job_salary],
Expand Down
70 changes: 21 additions & 49 deletions scrappers/justjoinit.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,36 @@
import re
import requests
from bs4 import BeautifulSoup
from modules.base_logger import log
from modules.common import updateExcel
from modules.common import getDomainName, updateExcel

class JustJoinIt():
def __init__(self):
self.jobs_dict = {}

def updateJobsDict(self):
url = 'https://justjoin.it/api/offers'
def updateJobsDict(self, url):
domainName = getDomainName(url)
try:
headers = {
"content-type": "application/json, text/plain",
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64; rv:57.0) "
"Gecko/20100101 Firefox/57.0"
),
"Host": "justjoin.it",
"Referer": "justjoin.it",
}
response = requests.get(url, headers=headers, timeout=120)
return response
page = requests.get(url, timeout=120)
page_soup = BeautifulSoup(page.content, "html.parser")
job_links_list = page_soup.find_all("div", {"class": "css-1iq2gw3"})

for job in job_links_list:
job_link = "https://"+domainName+job.find('a', class_='css-4lqp8g')['href']
job_title = job.find('h2').text
job_company = job.find('div', class_=re.compile("css-ldh1c9", re.I)).text
job_salary = job.find('div', class_=re.compile("css-1b2ga3v", re.I)).text
job_location = job.find('div', class_=re.compile("css-68pppj", re.I)).text
self.jobs_dict[job_link] = {"Title": [job_title],
"Company": [job_company],
"Salary": [job_salary],
"Location": [job_location]}
except Exception as e:
print(f"Exception {e} on updateJobsDict.")
return None

def prepareJobsDict(self, response, role, lvl, city):
marker_list = []
city_list = []
exp_list = []

for offer_dict in response.json():
url = f'https://justjoin.it/offers/{offer_dict["id"]}'

if offer_dict.get("marker_icon") not in role:
continue
if offer_dict.get("experience_level") not in lvl:
continue
if (offer_dict.get("workplace_type") not in ("remote") and
not (offer_dict.get("workplace_type") not in ("remote") and offer_dict.get("city") in city)):
continue
if offer_dict.get("display_offer") is False:
continue

job_title = offer_dict.get("title")
job_company = offer_dict.get("company_name")
job_salary = offer_dict.get("employment_types")
job_location = offer_dict.get("city")

self.jobs_dict[url] = {"Title": [job_title],
"Company": [job_company],
"Salary": [job_salary],
"Location": [job_location]}
marker_list.append(offer_dict.get("marker_icon"))
city_list.append(offer_dict.get("city"))
exp_list.append(offer_dict.get("experience_level"))

def run(sheetname, role, lvl, city):
def run(sheetname, url):
log.info("Starting JustJointIt scrapper.")
just = JustJoinIt()
resp = just.updateJobsDict()
just.prepareJobsDict(resp, role, lvl, city)
just.updateJobsDict(url)
updateExcel(sheetname, just.jobs_dict)
log.info("Finished JustJoinIt scrapper.")
2 changes: 1 addition & 1 deletion scrappers/nofluffjobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def updateJobsDict(self, url):
for job in job_links_list:
job_link = "https://"+domainName+job['href']
job_title = job.find('h3').text
job_company = job.find('span', class_=re.compile("company", re.I)).text
job_company = job.find('h4').text
job_salary = job.find('span', class_=re.compile("badgy salary", re.I)).text
job_location = job.find('div', class_=re.compile("tw-flex tw-items-center ng-star-inserted", re.I)).text

Expand Down
6 changes: 5 additions & 1 deletion tests/test_createLinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,8 @@ def test_createLinks_BulldogJob():

def test_createLinks_NoFluffJobs():
assert createLinks(site='NoFluffJobs', role="testing", lvl="junior,mid", city="Gdańsk") == \
"https://nofluffjobs.com/pl/praca-zdalna/testing?criteria=city%3DGdańsk%20%20seniority%3Djunior,mid"
"https://nofluffjobs.com/pl/praca-zdalna/testing?criteria=city%3DGdańsk%20%20seniority%3Djunior,mid"

def test_createLinks_JustjoinIt():
    """createLinks builds the JustjoinIt URL with an ASCII, lower-cased city."""
    expected = "https://justjoin.it/gdansk/testing/experience-level_mid.senior/remote_yes"
    generated = createLinks(site="JustjoinIt", role="testing", lvl="mid.senior", city="Gdańsk")
    assert generated == expected
8 changes: 4 additions & 4 deletions tests/test_linksReachable.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_is_BulldogJob_reachable():
response = requests.get(link, timeout=120, headers=headers)
assert response.status_code == 200

# def test_is_JustJoinIt_reachable():
# link = 'https://justjoin.it/api/offers'
# response = requests.get(link, timeout=120)
# assert response.status_code == 200
def test_is_JustJoinIt_reachable():
    """The generated JustjoinIt link answers with HTTP 200."""
    target = createLinks(site="JustjoinIt", role="testing", lvl="mid.senior", city="Gdańsk")
    status = requests.get(target, timeout=120).status_code
    assert status == 200

0 comments on commit 351ce26

Please sign in to comment.