
Merge pull request #184 from dod-advana/varunt/air_force_crawler_extensions

Extending Air Force Crawler Capabilities
vat99 committed Aug 25, 2023
2 parents b8a5e3b + b0dbb27 commit 4ac4f0e
Showing 2 changed files with 31 additions and 23 deletions.
34 changes: 23 additions & 11 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/air_force_spider.py
@@ -41,7 +41,13 @@ class AirForcePubsSpider(GCSeleniumSpider):
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=1', # AIR FORCE
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=16', # AIR NATIONAL GUARD
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=20', # UNITED STATES SPACE FORCE
# 'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=2' # MAJOR COMMANDS
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=2', # MAJOR COMMANDS
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=18', # LEAD COMMANDS
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=3', # DRUs
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=4', # FOAs
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=5', # NUMBERED AIR FORCES
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=7' # UNITS

] # URL where the spider begins crawling

file_type = "pdf" # Define filetype for the spider to identify.
@@ -80,14 +86,13 @@ def parse(self, response):
# if page_url.endswith('catID=2'): # Optional condition to pull AF Reserve Command docs from Major Commands section
# organizations = ['Air Force Reserve Command']

for org in organizations:
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.LINK_TEXT, org))))

for org in organizations:
try:
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.LINK_TEXT, org))))

all_pubs = WebDriverWait(driver, 5).until(
EC.visibility_of_element_located((By.LINK_TEXT, '00 ALL PUBLICATIONS')))

except:
driver.back()
print(f"Failed to find publications link for: {org} at {page_url}")
@@ -102,8 +107,13 @@
for item in self.parse_table(driver):
yield item

last_page_raw = driver.find_element(By.CSS_SELECTOR, '#data_paginate > span > a:last-child')
last_page = int(last_page_raw.text)
try:
last_page_raw = driver.find_element(By.CSS_SELECTOR, '#data_paginate > span > a:last-child')
last_page = int(last_page_raw.text)
except:
driver.back()
print(f"Failed to find last page: {org} at {page_url}")
continue

while last_page > 1:
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 5).until(
@@ -126,8 +136,10 @@ def parse_table(self, driver):

## Iterate through each row in table get column values as metadata for each downloadable document
for row in webpage.css(row_selector):
product_number_raw = row.css(
f'td:nth-child(1) a::text').get(default='')
product_number_raw = row.xpath('td//text()')[0].extract()
## If the table contains no entries then skip
if product_number_raw == "No data available in table":
continue
web_url = row.css(
f'td:nth-child(1) a::attr(href)').get(default='')
title_raw = row.css(
@@ -209,7 +221,7 @@ def parse_table(self, driver):
or '-S' in prod_num else False

source_page_url = driver.current_url

fields = {
'doc_name': doc_name,
'doc_num': doc_num,
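
Taken together, the spider changes follow one pattern: every Selenium lookup that can legitimately fail on an empty or oddly structured category page is wrapped so the crawler backs out with driver.back() and moves on instead of raising. Below is a minimal, illustrative sketch of that pattern; the function names crawl_category and parse_rows, the organizations argument, and the explicit click-through to the publications table are assumptions for the sake of a self-contained example, while the selectors, timeouts, and the "No data available in table" sentinel come from the changed code.

# Illustrative sketch of the error-handling pattern this PR adds; not the full spider.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def crawl_category(driver, page_url, organizations):
    for org in organizations:
        try:
            # Click the organization link once it becomes clickable (10 s timeout).
            driver.execute_script(
                "arguments[0].click();",
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.LINK_TEXT, org))))

            # Some organizations expose no publications listing at all.
            all_pubs = WebDriverWait(driver, 5).until(
                EC.visibility_of_element_located(
                    (By.LINK_TEXT, '00 ALL PUBLICATIONS')))
        except Exception:
            # Back out to the category page and try the next organization.
            driver.back()
            print(f"Failed to find publications link for: {org} at {page_url}")
            continue

        # Assumed click-through to the publications table (not shown in the diff).
        driver.execute_script("arguments[0].click();", all_pubs)

        try:
            # The paginator is absent when the table is empty or fits on one page.
            last_page_raw = driver.find_element(
                By.CSS_SELECTOR, '#data_paginate > span > a:last-child')
            last_page = int(last_page_raw.text)
        except Exception:
            driver.back()
            print(f"Failed to find last page: {org} at {page_url}")
            continue

        # ... page through 1..last_page, yielding items from the results table ...


def parse_rows(rows):
    """Assumed helper mirroring the parse_table() tweak: skip the placeholder row."""
    for row in rows:
        product_number_raw = row.xpath('td//text()')[0].extract()
        if product_number_raw == "No data available in table":
            continue  # an empty DataTable renders a single sentinel row
        yield product_number_raw
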
20 changes: 8 additions & 12 deletions docker/core/Dockerfile
@@ -87,21 +87,17 @@ RUN curl -LfSo /tmp/awscliv2.zip "https://awscli.amazonaws.com/awscli-exe-linux-
&& /opt/aws/install

## Getting chrome browser
COPY ./docker/core/google-chrome.repo /etc/yum.repos.d/google-chrome.repo
RUN \
curl https://dl-ssl.google.com/linux/linux_signing_key.pub -o /tmp/google_key.pub \
&& rpm --import /tmp/google_key.pub \
&& rm /tmp/google_key.pub \
&& yum install google-chrome-stable -y \
&& yum clean all \
&& rm -rf /var/cache/yum
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm -P /tmp/ \
&& yum install /tmp/google-chrome-stable_current_x86_64.rpm -y \
&& rm /tmp/google-chrome-stable_current_x86_64.rpm

## Getting chrome driver
RUN \
wget -O /tmp/chromedriver.zip \
https://chromedriver.storage.googleapis.com/$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE)/chromedriver_linux64.zip \
&& unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ \
&& rm /tmp/chromedriver.zip
wget -O /tmp/chromedriver.zip \
https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5845.96/linux64/chromedriver-linux64.zip \
&& unzip /tmp/chromedriver.zip chromedriver-linux64/chromedriver -d /tmp/ \
&& mv /tmp/chromedriver-linux64/chromedriver /usr/local/bin/ \
&& rm -rf /tmp/chromedriver*

#####
## ## Python packages
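
On the Docker side, the old steps installed Chrome from Google's yum repo and fetched whatever ChromeDriver LATEST_RELEASE pointed at, so the browser and driver could drift apart; the new steps install the current stable Chrome RPM directly and pin the ChromeDriver 116.0.5845.96 build from the Chrome for Testing download host, lining up with the Chrome 116 stable release current at the time. Below is a small, hypothetical sanity check (not part of the repo) that the two binaries installed by this Dockerfile report the same major version; it assumes google-chrome-stable and chromedriver are on PATH, which is what the image produces.

# Hypothetical helper (not in the repo): confirm the Chrome browser and ChromeDriver
# installed by docker/core/Dockerfile share a major version before crawling.
import re
import subprocess


def major_version(binary):
    """Return the major version reported by `<binary> --version`, e.g. '116'."""
    out = subprocess.run([binary, "--version"], capture_output=True, text=True, check=True)
    match = re.search(r"(\d+)\.\d+\.\d+\.\d+", out.stdout)
    if match is None:
        raise RuntimeError(f"Could not parse a version from: {out.stdout!r}")
    return match.group(1)


if __name__ == "__main__":
    chrome = major_version("google-chrome-stable")
    driver = major_version("chromedriver")
    if chrome != driver:
        raise SystemExit(f"Version mismatch: Chrome {chrome} vs ChromeDriver {driver}")
    print(f"Chrome and ChromeDriver both at major version {chrome}")
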
