Fix JustjoinIT scrapper (#48)

* First update on justjoinit scrapper * Update runner * Fix nofluffjobs scrapper * Fix bulldogjob scrapper * Reenable tests
wkobiela · Jan 7, 2024 · 351ce26 · 351ce26
1 parent 9f50292
commit 351ce26
Show file tree

Hide file tree

Showing 10 changed files with 128 additions and 75 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,5 @@
 # jobScrapper
 
-### JustJoinIt scrapper temporarily disabled, API is unreachable - no further information, if it will be back.
-
 ## Description
 Simple Python project that should make it easier to stay up to date with job offers. Websites like BulldogJob, NoFluffJobs or JustJoinIt have this nasty feature - job offers that are "refreshed" are bumped to the top of the page, so it is easy to lose track and even apply to the same job twice.
 
@@ -66,13 +64,13 @@ bulldogjob_settings = {
 ### JustJoinIt
 
 To set up the JustJoinIt scrapper, insert 3 MAIN parameters.
-- role (list of strings) from available: `'testing', 'net', 'architecture', 'ruby', 'php', 'mobile', 'other', 'analytics', 'erp', 'go', 'admin', 'scala', 'pm', 'support', 'data', 'java', 'security', 'game', 'python', 'ux', 'c', 'javascript', 'devops', 'html'`
-- lvl (list of strings) from avaliable: `'junior', 'mid', 'senior'`
+- role (single string) from available: `'testing', 'net', 'architecture', 'ruby', 'php', 'mobile', 'other', 'analytics', 'erp', 'go', 'admin', 'scala', 'pm', 'support', 'data', 'java', 'security', 'game', 'python', 'ux', 'c', 'javascript', 'devops', 'html'`
+- lvl (single string, levels separated by a dot, e.g. `'mid.senior'`) from available: `'junior', 'mid', 'senior', 'c-level'`
 - city (string) - remote offers are always included; offers from the city of your choosing are added on top of them
 ```
 justjoinit_settings = {
     "role": "testing",
-    "lvl": ["mid", "junior"],
+    "lvl": "mid.senior",
     "city": "Gdańsk" 
 }
 ```

diff --git a/deprecated/justjoinit.py b/deprecated/justjoinit.py
@@ -0,0 +1,64 @@
+import requests
+from modules.base_logger import log
+from modules.common import updateExcel
+
+class JustJoinIt():
+    def __init__(self):
+        self.jobs_dict = {}      
+
+    def updateJobsDict(self):
+        url = 'https://justjoin.it/api/offers'
+        try:
+            headers = {
+                "content-type": "application/json, text/plain",
+                "User-Agent": (
+                    "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) "
+                    "Gecko/20100101 Firefox/57.0"
+                ),
+                "Host": "justjoin.it",
+                "Referer": "justjoin.it",
+            }
+            response = requests.get(url, headers=headers, timeout=120)
+            return response
+        except Exception as e:
+            print(f"Exception {e} on updateJobsDict.")
+            return None
+
+    def prepareJobsDict(self, response, role, lvl, city):
+        marker_list = []
+        city_list = []
+        exp_list = []
+
+        for offer_dict in response.json():
+            url = f'https://justjoin.it/offers/{offer_dict["id"]}'
+
+            if offer_dict.get("marker_icon") not in role:
+                continue
+            if offer_dict.get("experience_level") not in lvl:
+                continue
+            if (offer_dict.get("workplace_type") not in ("remote") and 
+                not (offer_dict.get("workplace_type") not in ("remote") and offer_dict.get("city") in city)):
+                continue
+            if offer_dict.get("display_offer") is False:
+                continue
+
+            job_title = offer_dict.get("title")
+            job_company = offer_dict.get("company_name")
+            job_salary = offer_dict.get("employment_types")
+            job_location = offer_dict.get("city")
+
+            self.jobs_dict[url] = {"Title": [job_title], 
+                                    "Company": [job_company], 
+                                    "Salary": [job_salary], 
+                                    "Location": [job_location]}
+            marker_list.append(offer_dict.get("marker_icon"))
+            city_list.append(offer_dict.get("city"))
+            exp_list.append(offer_dict.get("experience_level"))
+
+def run(sheetname, role, lvl, city):
+    log.info("Starting JustJointIt scrapper.")
+    just = JustJoinIt()
+    resp = just.updateJobsDict()
+    just.prepareJobsDict(resp, role, lvl, city)
+    updateExcel(sheetname, just.jobs_dict)
+    log.info("Finished JustJoinIt scrapper.")
diff --git a/modules/common.py b/modules/common.py
@@ -7,6 +7,7 @@
 from bs4 import BeautifulSoup
 from openpyxl import load_workbook
 from modules.base_logger import log
+from unidecode import unidecode
 
 now = datetime.now()
 
@@ -101,5 +102,7 @@ def createLinks(**kwargs):
         generated_link = f"https://bulldogjob.pl/companies/jobs/s/role,{role}/experienceLevel,{lvl}/city,{city}"
     elif site == "NoFluffJobs":
         generated_link = f"https://nofluffjobs.com/pl/praca-zdalna/{role}?criteria=city%3D{city}%20%20seniority%3D{lvl}"
+    elif site == "JustjoinIt":
+        generated_link = f"https://justjoin.it/{unidecode(city).lower()}/{role}/experience-level_{lvl}/remote_yes"
     log.info("Generated link: %s", generated_link)    
     return(generated_link)
diff --git a/requirements.txt b/requirements.txt
@@ -5,3 +5,4 @@ pandas==2.1.4; python_version > '3.8'
 pytest==7.4.4
 Requests==2.31.0
 pytest-html==4.1.1
+unidecode==1.3.7
diff --git a/runner.py b/runner.py
@@ -25,8 +25,9 @@
 }
 
 justjoinit_settings = {
-    "role": ["testing"],
-    "lvl": ["mid", "junior"],
+    "site": "JustjoinIt",
+    "role": "testing",
+    "lvl": "mid.senior",
     "city": "Gdańsk" 
 }
 
@@ -39,12 +40,13 @@
                                     role=bulldogjob_settings['role'], 
                                     lvl=bulldogjob_settings['lvl'], 
                                     city=bulldogjob_settings['city'])
+JUSTJOINIT_URL = common.createLinks(site=justjoinit_settings['site'],
+                                    role=justjoinit_settings['role'],
+                                    lvl=justjoinit_settings['lvl'],
+                                    city=justjoinit_settings['city'])
 
 # Run setup and scrappers
 setup.run(EXCEL_NAME, NOFLUFFJOBS_SHEET, BULLDOGJOB_SHEET, JUSTJOINIT_SHEET)
 nofluffjobs.run(NOFLUFFJOBS_SHEET, NOFLUFFJOBS_URL)
 bulldogjob.run(BULLDOGJOB_SHEET, BULLDOGJOB_URL)
-# justjoinit.run(JUSTJOINIT_SHEET, 
-#                 role=justjoinit_settings['role'], 
-#                 lvl=justjoinit_settings['lvl'], 
-#                 city=justjoinit_settings['city'])
+justjoinit.run(JUSTJOINIT_SHEET, JUSTJOINIT_URL)
diff --git a/scrappers/bulldogjob.py b/scrappers/bulldogjob.py
@@ -26,24 +26,33 @@ def updateJobsDict(self, url):
                 for job in job_links_list:
                     try:
                         #workarount for false job objects
-                        if not job.find('button', class_=re.compile("flex items-center w-full relative text-xs", re.I)): 
+                        if not job.find('div', class_=re.compile("flex flex-col items-center relative my-auto", re.I)): 
                             continue
+
                         job_link = job.get('href')
-                        job_title = job.find(name="h3", 
-                                            class_="md:mb-5 lg:mb-0 text-18 font-extrabold leading-8 mr-8 md:mr-0")
-                        if job_title is not None:
+
+                        job_title = job.find(name="h3", class_=re.compile("md:mb-5 lg:mb-0 md:text-18 text", re.I))
+                        if job_title.find(text=True, recursive=True) is not None:
                             job_title = job_title.find(text=True, recursive=False).text
                         else:
-                            job_title="Sprawdź regex."
-                        job_company = job.find('div', class_=re.compile("text-xxs uppercase", re.I)).text
+                            job_title="Regex error."
+
+                        job_company = job.find('div', class_=re.compile("text-xxs uppercase", re.I))
+                        if job_company.find(text=True, recursive=True) is not None:
+                            job_company = job_company.find(text=True, recursive=True) is not None
+                        else:
+                            job_company = "Regex error"
+
                         job_salary = job.find('div', class_=re.compile("lg:font-extrabold md:text-xl text-dm", re.I))
-                        if job_salary.find(text=True, recursive=True) is not None:
+                        if job_salary is not None:
                             job_salary = job_salary.find(text=True, recursive=True).text
                         else:
-                            job_salary="Brak informacji"
+                            job_salary="No information or regex error"
+
                         job_overall_info = job.find_all('div', class_=re.compile("flex items-start", re.I))
                         for info in job_overall_info:
-                            text = text + info.find('span').text + " / "                  
+                            text = text + info.find('span').text + " / "
+
                         self.jobs_dict[job_link] = {"Title": [job_title], 
                                                     "Company": [job_company], 
                                                     "Salary": [job_salary], 

diff --git a/scrappers/justjoinit.py b/scrappers/justjoinit.py
@@ -1,64 +1,36 @@
+import re
 import requests
+from bs4 import BeautifulSoup
 from modules.base_logger import log
-from modules.common import updateExcel
+from modules.common import getDomainName, updateExcel
 
 class JustJoinIt():
     def __init__(self):
         self.jobs_dict = {}      
 
-    def updateJobsDict(self):
-        url = 'https://justjoin.it/api/offers'
+    def updateJobsDict(self, url):
+        domainName = getDomainName(url)
         try:
-            headers = {
-                "content-type": "application/json, text/plain",
-                "User-Agent": (
-                    "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) "
-                    "Gecko/20100101 Firefox/57.0"
-                ),
-                "Host": "justjoin.it",
-                "Referer": "justjoin.it",
-            }
-            response = requests.get(url, headers=headers, timeout=120)
-            return response
+            page = requests.get(url, timeout=120)
+            page_soup = BeautifulSoup(page.content, "html.parser")
+            job_links_list = page_soup.find_all("div", {"class": "css-1iq2gw3"})
+
+            for job in job_links_list:
+                job_link = "https://"+domainName+job.find('a',  class_='css-4lqp8g')['href']
+                job_title = job.find('h2').text
+                job_company = job.find('div', class_=re.compile("css-ldh1c9", re.I)).text 
+                job_salary = job.find('div', class_=re.compile("css-1b2ga3v", re.I)).text
+                job_location = job.find('div', class_=re.compile("css-68pppj", re.I)).text
+                self.jobs_dict[job_link] = {"Title": [job_title], 
+                                            "Company": [job_company], 
+                                            "Salary": [job_salary], 
+                                            "Location": [job_location]}
         except Exception as e:
             print(f"Exception {e} on updateJobsDict.")
-            return None
-
-    def prepareJobsDict(self, response, role, lvl, city):
-        marker_list = []
-        city_list = []
-        exp_list = []
-
-        for offer_dict in response.json():
-            url = f'https://justjoin.it/offers/{offer_dict["id"]}'
-
-            if offer_dict.get("marker_icon") not in role:
-                continue
-            if offer_dict.get("experience_level") not in lvl:
-                continue
-            if (offer_dict.get("workplace_type") not in ("remote") and 
-                not (offer_dict.get("workplace_type") not in ("remote") and offer_dict.get("city") in city)):
-                continue
-            if offer_dict.get("display_offer") is False:
-                continue
-
-            job_title = offer_dict.get("title")
-            job_company = offer_dict.get("company_name")
-            job_salary = offer_dict.get("employment_types")
-            job_location = offer_dict.get("city")
-
-            self.jobs_dict[url] = {"Title": [job_title], 
-                                    "Company": [job_company], 
-                                    "Salary": [job_salary], 
-                                    "Location": [job_location]}
-            marker_list.append(offer_dict.get("marker_icon"))
-            city_list.append(offer_dict.get("city"))
-            exp_list.append(offer_dict.get("experience_level"))
 
-def run(sheetname, role, lvl, city):
+def run(sheetname, url):
     log.info("Starting JustJointIt scrapper.")
     just = JustJoinIt()
-    resp = just.updateJobsDict()
-    just.prepareJobsDict(resp, role, lvl, city)
+    just.updateJobsDict(url)
     updateExcel(sheetname, just.jobs_dict)
     log.info("Finished JustJoinIt scrapper.")
diff --git a/scrappers/nofluffjobs.py b/scrappers/nofluffjobs.py
@@ -23,7 +23,7 @@ def updateJobsDict(self, url):
                 for job in job_links_list:
                     job_link = "https://"+domainName+job['href']
                     job_title = job.find('h3').text
-                    job_company = job.find('span', class_=re.compile("company", re.I)).text 
+                    job_company = job.find('h4').text 
                     job_salary = job.find('span', class_=re.compile("badgy salary", re.I)).text
                     job_location = job.find('div', class_=re.compile("tw-flex tw-items-center ng-star-inserted", re.I)).text
 

diff --git a/tests/test_createLinks.py b/tests/test_createLinks.py
@@ -20,4 +20,8 @@ def test_createLinks_BulldogJob():
 
 def test_createLinks_NoFluffJobs():
     assert createLinks(site='NoFluffJobs', role="testing", lvl="junior,mid", city="Gdańsk") == \
-        "https://nofluffjobs.com/pl/praca-zdalna/testing?criteria=city%3DGdańsk%20%20seniority%3Djunior,mid"
+        "https://nofluffjobs.com/pl/praca-zdalna/testing?criteria=city%3DGdańsk%20%20seniority%3Djunior,mid"
+
+def test_createLinks_JustjoinIt():
+    assert createLinks(site="JustjoinIt", role="testing", lvl="mid.senior", city="Gdańsk") == \
+        "https://justjoin.it/gdansk/testing/experience-level_mid.senior/remote_yes"
diff --git a/tests/test_linksReachable.py b/tests/test_linksReachable.py
@@ -15,7 +15,7 @@ def test_is_BulldogJob_reachable():
     response = requests.get(link, timeout=120, headers=headers)
     assert response.status_code == 200
 
-# def test_is_JustJoinIt_reachable():
-#     link = 'https://justjoin.it/api/offers'
-#     response = requests.get(link, timeout=120)
-#     assert response.status_code == 200
+def test_is_JustJoinIt_reachable():
+    link = createLinks(site="JustjoinIt", role="testing", lvl="mid.senior", city="Gdańsk")
+    response = requests.get(link, timeout=120)
+    assert response.status_code == 200