
Merge pull request #184 from dod-advana/varunt/air_force_crawler_extensions

Extending Air Force Crawler Capabilities
vat99 committed Aug 25, 2023
2 parents b8a5e3b + b0dbb27 commit 4ac4f0e
Showing 2 changed files with 31 additions and 23 deletions.
34 changes: 23 additions & 11 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/air_force_spider.py
@@ -41,7 +41,13 @@ class AirForcePubsSpider(GCSeleniumSpider):
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=1', # AIR FORCE
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=16', # AIR NATIONAL GUARD
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=20', # UNITED STATES SPACE FORCE
# 'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=2' # MAJOR COMMANDS
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=2', # MAJOR COMMANDS
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=18', # LEAD COMMANDS
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=3', # DRUs
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=4', # FOAs
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=5', # NUMBERED AIR FORCES
'https://www.e-publishing.af.mil/Product-Index/#/?view=cat&catID=7' # UNITS

] # URL where the spider begins crawling

file_type = "pdf" # Define filetype for the spider to identify.
@@ -80,14 +86,13 @@ def parse(self, response):
# if page_url.endswith('catID=2'): # Optional condition to pull AF Reserve Command docs from Major Commands section
# organizations = ['Air Force Reserve Command']

for org in organizations:
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.LINK_TEXT, org))))

for org in organizations:
try:
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.LINK_TEXT, org))))

all_pubs = WebDriverWait(driver, 5).until(
EC.visibility_of_element_located((By.LINK_TEXT, '00 ALL PUBLICATIONS')))

except:
driver.back()
print(f"Failed to find publications link for: {org} at {page_url}")
@@ -102,8 +107,13 @@
for item in self.parse_table(driver):
yield item

last_page_raw = driver.find_element(By.CSS_SELECTOR, '#data_paginate > span > a:last-child')
last_page = int(last_page_raw.text)
try:
last_page_raw = driver.find_element(By.CSS_SELECTOR, '#data_paginate > span > a:last-child')
last_page = int(last_page_raw.text)
except:
driver.back()
print(f"Failed to find last page: {org} at {page_url}")
continue

while last_page > 1:
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 5).until(
@@ -126,8 +136,10 @@ def parse_table(self, driver):

## Iterate through each row in table get column values as metadata for each downloadable document
for row in webpage.css(row_selector):
product_number_raw = row.css(
f'td:nth-child(1) a::text').get(default='')
product_number_raw = row.xpath('td//text()')[0].extract()
## If the table contains no entries then skip
if product_number_raw == "No data available in table":
continue
web_url = row.css(
f'td:nth-child(1) a::attr(href)').get(default='')
title_raw = row.css(
@@ -209,7 +221,7 @@ def parse_table(self, driver):
or '-S' in prod_num else False

source_page_url = driver.current_url

fields = {
'doc_name': doc_name,
'doc_num': doc_num,
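
Taken together, the spider changes follow one pattern: every Selenium lookup that can legitimately fail on an empty or oddly structured category page is wrapped so the crawler backs out with driver.back() and moves on instead of raising. Below is a minimal, illustrative sketch of that pattern; the function names crawl_category and parse_rows, the organizations argument, and the explicit click-through to the publications table are assumptions for the sake of a self-contained example, while the selectors, timeouts, and the "No data available in table" sentinel come from the changed code.

# Illustrative sketch of the error-handling pattern this PR adds; not the full spider.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def crawl_category(driver, page_url, organizations):
    for org in organizations:
        try:
            # Click the organization link once it becomes clickable (10 s timeout).
            driver.execute_script(
                "arguments[0].click();",
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.LINK_TEXT, org))))

            # Some organizations expose no publications listing at all.
            all_pubs = WebDriverWait(driver, 5).until(
                EC.visibility_of_element_located(
                    (By.LINK_TEXT, '00 ALL PUBLICATIONS')))
        except Exception:
            # Back out to the category page and try the next organization.
            driver.back()
            print(f"Failed to find publications link for: {org} at {page_url}")
            continue

        # Assumed click-through to the publications table (not shown in the diff).
        driver.execute_script("arguments[0].click();", all_pubs)

        try:
            # The paginator is absent when the table is empty or fits on one page.
            last_page_raw = driver.find_element(
                By.CSS_SELECTOR, '#data_paginate > span > a:last-child')
            last_page = int(last_page_raw.text)
        except Exception:
            driver.back()
            print(f"Failed to find last page: {org} at {page_url}")
            continue

        # ... page through 1..last_page, yielding items from the results table ...


def parse_rows(rows):
    """Assumed helper mirroring the parse_table() tweak: skip the placeholder row."""
    for row in rows:
        product_number_raw = row.xpath('td//text()')[0].extract()
        if product_number_raw == "No data available in table":
            continue  # an empty DataTable renders a single sentinel row
        yield product_number_raw
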
20 changes: 8 additions & 12 deletions docker/core/Dockerfile
@@ -87,21 +87,17 @@ RUN curl -LfSo /tmp/awscliv2.zip "https://awscli.amazonaws.com/awscli-exe-linux-
&& /opt/aws/install

## Getting chrome browser
COPY ./docker/core/google-chrome.repo /etc/yum.repos.d/google-chrome.repo
RUN \
curl https://dl-ssl.google.com/linux/linux_signing_key.pub -o /tmp/google_key.pub \
&& rpm --import /tmp/google_key.pub \
&& rm /tmp/google_key.pub \
&& yum install google-chrome-stable -y \
&& yum clean all \
&& rm -rf /var/cache/yum
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm -P /tmp/ \
&& yum install /tmp/google-chrome-stable_current_x86_64.rpm -y \
&& rm /tmp/google-chrome-stable_current_x86_64.rpm

## Getting chrome driver
RUN \
wget -O /tmp/chromedriver.zip \
https://chromedriver.storage.googleapis.com/$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE)/chromedriver_linux64.zip \
&& unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ \
&& rm /tmp/chromedriver.zip
wget -O /tmp/chromedriver.zip \
https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5845.96/linux64/chromedriver-linux64.zip \
&& unzip /tmp/chromedriver.zip chromedriver-linux64/chromedriver -d /tmp/ \
&& mv /tmp/chromedriver-linux64/chromedriver /usr/local/bin/ \
&& rm -rf /tmp/chromedriver*

#####
## ## Python packages
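
On the Docker side, the old steps installed Chrome from Google's yum repo and fetched whatever ChromeDriver LATEST_RELEASE pointed at, so the browser and driver could drift apart; the new steps install the current stable Chrome RPM directly and pin the ChromeDriver 116.0.5845.96 build from the Chrome for Testing download host, lining up with the Chrome 116 stable release current at the time. Below is a small, hypothetical sanity check (not part of the repo) that the two binaries installed by this Dockerfile report the same major version; it assumes google-chrome-stable and chromedriver are on PATH, which is what the image produces.

# Hypothetical helper (not in the repo): confirm the Chrome browser and ChromeDriver
# installed by docker/core/Dockerfile share a major version before crawling.
import re
import subprocess


def major_version(binary):
    """Return the major version reported by `<binary> --version`, e.g. '116'."""
    out = subprocess.run([binary, "--version"], capture_output=True, text=True, check=True)
    match = re.search(r"(\d+)\.\d+\.\d+\.\d+", out.stdout)
    if match is None:
        raise RuntimeError(f"Could not parse a version from: {out.stdout!r}")
    return match.group(1)


if __name__ == "__main__":
    chrome = major_version("google-chrome-stable")
    driver = major_version("chromedriver")
    if chrome != driver:
        raise SystemExit(f"Version mismatch: Chrome {chrome} vs ChromeDriver {driver}")
    print(f"Chrome and ChromeDriver both at major version {chrome}")
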
