Merge pull request #190 from dod-advana/hotfix/dod_corona_selector
selector corrected
takao8 committed Sep 7, 2023
2 parents 4ac4f0e + e821e6d commit 06cee66
Showing 1 changed file with 27 additions and 9 deletions.
@@ -4,6 +4,8 @@
 from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
 import re
 from urllib.parse import urlparse
+import hashlib
+import os
 
 covid_re = re.compile(r'covid|covid\-19|coronavirus', flags=re.IGNORECASE)
 
@@ -32,16 +34,25 @@ def get_downloadable_item(self, href, base_url=None) -> dict:
"compression_type": None
}

# Doc naming convention, does not fit all cases yet
def validateDisplayDoc(self, title):
if "guidance" in title.lower():
return "Guidance"
return "Document"

def parse(self, response):
blocks = response.css('div.dgov-grid div.block')

for block in blocks:
category_text = self.ascii_clean(block.css('h2.cat::text').get())
items = block.css('div.stories div.item')
category_text = self.ascii_clean(block.css('h2.cat::text').get()) # Category names
items = block.css('div.common-grid div.item') # Corrected the selector


for item in items:
doc_title_raw = item.css('a.title::text').get()
doc_title = self.ascii_clean(doc_title_raw)
doc_title = self.ascii_clean(item.css('a.title::text').extract_first("").strip())


href_raw = item.css('a.title::attr(href)').get()
download_url = self.ensure_full_href_url(
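The heart of the hotfix is the selector change above: the commit message and the "Corrected the selector" comment indicate the old div.stories div.item query no longer matched the page, so the spider silently scraped nothing. A minimal sketch with parsel (the selector library underneath Scrapy) illustrates both the corrected selector and the hardened title extraction; the sample markup here is hypothetical, loosely modeled on the grid layout the spider targets:

# Hypothetical markup sketch; the real defense.gov page structure may differ.
from parsel import Selector

html = """
<div class="dgov-grid">
  <div class="block">
    <h2 class="cat">Travel Restrictions</h2>
    <div class="common-grid">
      <div class="item"><a class="title" href="/doc1"> COVID-19 Guidance </a></div>
    </div>
  </div>
</div>
"""

block = Selector(text=html).css('div.dgov-grid div.block')[0]

print(block.css('div.stories div.item').getall())   # [] -- old selector, no matches
items = block.css('div.common-grid div.item')       # corrected selector
print(len(items))                                   # 1

# extract_first("") falls back to "" instead of None, so .strip() never raises
title = items[0].css('a.title::text').extract_first("").strip()
print(title)                                        # COVID-19 Guidance

Switching from .get() to extract_first("").strip() also hardens the title extraction: a missing a.title node now yields an empty string instead of passing None along to ascii_clean.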
@@ -58,19 +69,20 @@ def parse(self, response):

noted = " ".join(item.css('*.noted *::text').getall())
supp_downloadable_items = []
# add pub date to repeatedly issued listing
doc_title_without_date = f"{doc_title}" # Clean up display name, remove trailing date
if noted:
doc_title = f"{doc_title} - {publication_date}"

supplamental_hrefs = item.css(
'*.noted a::attr(href)').getall()
supp_downloadable_items = [
self.get_downloadable_item(href) for href in supplamental_hrefs
]

raw_doc_name = f"{category_text}: {doc_title}"
doc_name = f"{category_text}: {doc_title}"
display_doc_type = "Document" # Doc type for display on app
display_doc_type = self.validateDisplayDoc(self.doc_type) # Doc type for display on app
display_source = self.data_source + " - " + self.source_title
display_title = self.doc_type + ": " + doc_title
display_title = self.doc_type + " - " + doc_title_without_date
is_revoked = False
source_page_url = download_url
source_fqdn = urlparse(source_page_url).netloc
@@ -80,7 +92,8 @@ def parse(self, response):
"noted": noted,
"doc_name": doc_name,
"display_title": display_title,
"download_url": source_page_url
"download_url": source_page_url,
"display_doc_type": display_doc_type
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)
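Because display_doc_type is now part of version_hash_fields, every document's version hash changes, which presumably forces downstream consumers to re-ingest the corrected metadata. dict_to_sha256_hex_digest is imported from the project's utils module (this commit also adds import hashlib at the top of the file, though it goes unused in the hunks shown). The following is a hypothetical sketch of what such a helper typically does, assuming JSON-serializable values; it is an assumption, not the project's actual implementation:

# Hypothetical sketch of a dict -> sha256 hex digest helper; the real
# dict_to_sha256_hex_digest lives in dataPipelines.gc_scrapy.gc_scrapy.utils
# and may differ. Assumes all field values are JSON-serializable.
import hashlib
import json

def dict_to_sha256_hex_digest_sketch(fields: dict) -> str:
    # sort_keys=True makes the digest independent of key insertion order
    serialized = json.dumps(fields, sort_keys=True)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()

version_hash_fields = {
    "noted": "Updated weekly",
    "doc_name": "Guidance: COVID-19 Travel Restrictions",
    "display_title": "Guidance - COVID-19 Travel Restrictions",
    "download_url": "https://www.defense.gov/doc1",
    "display_doc_type": "Guidance",
}
print(dict_to_sha256_hex_digest_sketch(version_hash_fields))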
@@ -145,8 +158,8 @@ def parse_follow_page(self, response) -> typing.Union[DocItem, None]:
         supp_downloadable_items = response.meta["supp_downloadable_items"]
         doc_item["downloadable_items"] = []
 
-        href_finder = self.get_body_hrefs if len(
-            response.css('div.body')) else self.get_unknown_layout_hrefs
+        href_finder = self.get_body_hrefs if len(response.css('div.body')) else self.get_unknown_layout_hrefs
 
         hrefs = self.filter_mailto_hrefs(href_finder(response))

@@ -180,4 +193,9 @@ def parse_follow_page(self, response) -> typing.Union[DocItem, None]:

doc_item["version_hash"] = dict_to_sha256_hex_digest(doc_item["version_hash_raw_data"])

self.logger.info(f"Parsing follow page for URL: {response.url}")
self.logger.info(f"Captured {len(hrefs)} hrefs from URL: {response.url}")

yield doc_item
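One quirk worth noting in the new validateDisplayDoc helper: its parameter is named title, but the call site passes self.doc_type, so the display type now depends on the spider's configured doc type rather than the hard-coded "Document". A standalone check of the helper's behavior, mirroring the method added in this commit minus self:

def validateDisplayDoc(title):
    # "Guidance" wins whenever the string contains the word, case-insensitively
    if "guidance" in title.lower():
        return "Guidance"
    return "Document"

print(validateDisplayDoc("DoD Coronavirus Guidance"))  # Guidance
print(validateDisplayDoc("Memo"))                      # Document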

