Skip to content

Commit

Permalink
Merge pull request #196 from dod-advana/anthony/army_pubs_cac_gate
Browse files Browse the repository at this point in the history
refactored cac_required evaluation
  • Loading branch information
amaruca141 committed Oct 3, 2023
2 parents 61d0cd6 + 5b0ff12 commit aec1397
Showing 1 changed file with 19 additions and 8 deletions.
27 changes: 19 additions & 8 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/army_pubs_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,16 @@ def parse(self, response):
all_hrefs = response.css(
'li.usa-nav__primary-item')[2].css('a::attr(href)').getall() # Get all hyperlinks on page

cac_gated_hrefs = ['/ProductMaps/PubForm/EM.aspx', '/ProductMaps/PubForm/FT.aspx', '/ProductMaps/PubForm/LO.aspx',
'/ProductMaps/PubForm/MWO.aspx', '/ProductMaps/PubForm/SB.aspx', '/ProductMaps/PubForm/SC.aspx', '/ProductMaps/PubForm/TB.aspx',
'/ProductMaps/PubForm/TM_1_8.aspx', '/ProductMaps/PubForm/TM_9.aspx', '/ProductMaps/PubForm/TM_10.aspx', '/ProductMaps/PubForm/TM_11_4.aspx',
'/ProductMaps/PubForm/TM_11_5.aspx', '/ProductMaps/PubForm/TM_11_6_7.aspx', '/ProductMaps/PubForm/TM_14_750.aspx'] # All Links Under Technical & Equipment that require external link registration

links = [link for link in all_hrefs if link not in do_not_process] # Remove items in URL stop list from hyperlinks list

public_hrefs = [public for public in links if public not in cac_gated_hrefs] # After links filtering above, removes cac_gated_hrefs

# yield from response.follow_all(public_hrefs, self.parse_source_page) # Follow each link and call parse_source_page function for each; excluding cac_gated
yield from response.follow_all(links, self.parse_source_page) # Follow each link and call parse_source_page function for each

def parse_source_page(self, response):
Expand All @@ -51,9 +59,18 @@ def parse_source_page(self, response):
list of table links.
'''
table_links = response.css('table td a::attr(href)').extract() # Extract all links in the html table
yield from response.follow_all([self.pub_url+link for link in table_links], self.parse_detail_page) # Call parse_detail_page function for each link

def parse_detail_page(self, response):
# CAC Gate Eval
registration_required = response.xpath('//div//text()').getall() # Evaluates if 'registration is required' is in the source page
cac_login_required = False
for text in registration_required:
if 'registration is required' in text.lower():
cac_login_required = True
break

yield from response.follow_all([self.pub_url+link for link in table_links], self.parse_detail_page, cb_kwargs={'cac_login_required': cac_login_required}) # Call parse_detail_page function for each link and pass cac_login_required as an argument

def parse_detail_page(self, response, cac_login_required):
'''
This function generates a link and metadata for each document for use by bash download script.
'''
Expand All @@ -62,13 +79,7 @@ def parse_detail_page(self, response):
doc_num_raw = doc_name_raw.split()[-1] # Get numeric portion of document name as doc_num #### TODO: Sometimes this is Nonetype and causes an error
doc_type_raw = doc_name_raw.split()[0] # Get alphabetic portion of document name as doc_type
publication_date = response.xpath("//*[contains(text(), 'Pub/Form Date')]/following-sibling::node()[1]/text()").get() # Get document publication date
dist_stm = response.xpath("//*[contains(text(), 'Dist Restriction Code')]/following-sibling::node()[1]/text()").get() # Get document distribution statement (re: doc accessibility)
proponent = self.ascii_clean(response.xpath("//*[contains(text(), 'Pub/Form Proponent')]/following-sibling::node()[1]/text()").get()) # Get document "Proponent"
if dist_stm and (dist_stm.startswith("A") or dist_stm.startswith("N")):
cac_login_required = False # The distribution statement is either "A" or "Not Applicable", i.e. anyone can access
else:
cac_login_required = True # The distribution statement has more restrictions

linked_items = response.xpath("//*[contains(text(), 'Unit Of Issue(s)')]/following-sibling::node()[1]/a") # Get document link in row
downloadable_items = []

Expand Down

0 comments on commit aec1397

Please sign in to comment.