Commit d6d366c

Merge pull request #180 from dod-advana/varunt/crawler_fixes

DOD Corona Virus and MILPERSMAN fixes

takao8 committed Jul 27, 2023
2 parents acb50da + 50d822e commit d6d366c

Showing 3 changed files with 55 additions and 19 deletions.
8 changes: 6 additions & 2 deletions dataPipelines/gc_scrapy/gc_scrapy/pipelines.py
@@ -234,7 +234,8 @@ def item_completed(self, results, item, info):
# Build a path to each file associated with an item:
if compression_type:
file_download_path = Path(self.output_dir, output_file_name).with_suffix(f".{compression_type}") # Path for downloaded zipped file
- file_unzipped_path = Path(self.output_dir, output_file_name) # Path for unzipped files
+ file_unzipped_path = Path(self.output_dir, output_file_name)
+ # Path for unzipped files
metadata_download_path = f"{file_unzipped_path}.metadata" # Path for the accompanying metadata file
else:
# If it is a jbook crawler (and needs a different file output style)
@@ -267,7 +268,10 @@ def item_completed(self, results, item, info):

metadata_download_path = Path(self.output_dir, unzipped_item["doc_name"])
suffix_doc_type = f'{unzipped_item["downloadable_items"][0]["doc_type"]}'
- metadata_download_path = metadata_download_path.with_suffix(f'.{suffix_doc_type}.metadata')
+
+ # when making metadata_download_path, need to add the previous suffix in case there are
+ # periods in filename. will mess up metadata names otherwise
+ metadata_download_path = metadata_download_path.with_suffix(metadata_download_path.suffix + f'.{suffix_doc_type}.metadata')

with open(metadata_download_path, "w") as f: # Write the metadata for each unzipped file
try:
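The with_suffix() change in the second hunk above is easier to see in isolation. A minimal sketch, using a made-up document name (nothing here comes from the crawler's real output): Path.with_suffix() replaces everything after the last period, so calling it directly on a name that contains periods silently drops part of the name, while appending to the existing suffix keeps it intact.

from pathlib import Path

# Made-up document name containing periods; not taken from real crawler output.
doc_path = Path("/tmp/output/DoD COVID-19 Guidance v1.2")

# Old call: with_suffix() treats ".2" as the suffix and replaces it,
# so part of the document name is silently lost.
print(doc_path.with_suffix(".pdf.metadata").name)
# -> DoD COVID-19 Guidance v1.pdf.metadata

# New call: keep the existing suffix and append to it, as the committed fix does.
print(doc_path.with_suffix(doc_path.suffix + ".pdf.metadata").name)
# -> DoD COVID-19 Guidance v1.2.pdf.metadata
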
61 changes: 46 additions & 15 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/dod_coronavirus_spider.py
@@ -1,7 +1,9 @@
import typing
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
+ from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
import re
+ from urllib.parse import urlparse

covid_re = re.compile(r'covid|covid\-19|coronavirus', flags=re.IGNORECASE)

@@ -23,15 +25,14 @@ def get_downloadable_item(self, href, base_url=None) -> dict:
base_url = self.start_urls[0]

file_type = self.get_href_file_extension(href)
- web_url = self.ensure_full_href_url(href, base_url)
+ download_url = self.ensure_full_href_url(href, base_url)
return {
"doc_type": file_type,
"web_url": web_url.replace(' ', '%20'),
"download_url": download_url.replace(' ', '%20'),
"compression_type": None
}

def parse(self, response):

blocks = response.css('div.dgov-grid div.block')

for block in blocks:
@@ -43,7 +44,7 @@ def parse(self, response):
doc_title = self.ascii_clean(doc_title_raw)

href_raw = item.css('a.title::attr(href)').get()
- web_url = self.ensure_full_href_url(
+ download_url = self.ensure_full_href_url(
href_raw, self.start_urls[0])

(file_type, has_ext) = self.get_href_file_extension_does_exist(
@@ -66,32 +67,60 @@ def parse(self, response):
self.get_downloadable_item(href) for href in supplamental_hrefs
]

doc_name = f"{category_text}: {doc_title}"
display_doc_type = "Document" # Doc type for display on app
display_source = self.data_source + " - " + self.source_title
display_title = self.doc_type + ": " + doc_title
is_revoked = False
source_page_url = download_url
source_fqdn = urlparse(source_page_url).netloc

version_hash_fields = {
"publication_date": publication_date,
"noted": noted
"noted": noted,
"doc_name": doc_name,
"display_title": display_title,
"download_url": source_page_url
}

+ version_hash = dict_to_sha256_hex_digest(version_hash_fields)

doc_name = f"{category_text}: {doc_title}"
item = DocItem(
doc_name=doc_name,
doc_title=doc_title,
publication_date=publication_date,
version_hash_raw_data=version_hash_fields,
doc_name = doc_name,
doc_title = doc_title,
doc_type = self.doc_type,
display_doc_type = display_doc_type, #
cac_login_required = self.cac_login_required,
crawler_used = self.name,
source_page_url = source_page_url, #
source_fqdn = source_fqdn, #
version_hash_raw_data = version_hash_fields, #
display_org = self.display_org, #
data_source = self.data_source, #
source_title = self.source_title, #
display_source = display_source, #
display_title = display_title, #
file_ext = self.doc_type, #
is_revoked = is_revoked, #
version_hash = version_hash,
download_url=source_page_url,
doc_num = "None",
)

# some are downloadable items straight from start url
if has_ext:
item["downloadable_items"] = [
{
"doc_type": file_type,
"web_url": web_url.replace(' ', '%20'),
"download_url": download_url.replace(' ', '%20'),
"compression_type": None
}
]
item["downloadable_items"] + supp_downloadable_items

item["version_hash_raw_data"].update({
"item_currency": item["downloadable_items"][0]["web_url"],
"item_currency": item["downloadable_items"][0]["download_url"],
})

yield item
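
The rewritten parse() above now derives source_fqdn from the page URL before building the DocItem. A minimal sketch of that one line, using a placeholder URL rather than the spider's actual start URL:

from urllib.parse import urlparse

# Placeholder URL for illustration; the spider uses its own start_urls and hrefs.
source_page_url = "https://www.example.mil/Spotlights/Coronavirus/Some%20Guidance.pdf"

# urlparse().netloc keeps only the host portion, which is what source_fqdn stores.
source_fqdn = urlparse(source_page_url).netloc
print(source_fqdn)  # www.example.mil
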
@@ -123,12 +152,12 @@ def parse_follow_page(self, response) -> typing.Union[DocItem, None]:

for href in hrefs:
(file_type, has_ext) = self.get_href_file_extension_does_exist(href)
- web_url = self.ensure_full_href_url(href, self.start_urls[0])
+ download_url = self.ensure_full_href_url(href, self.start_urls[0])
if has_ext:
doc_item["downloadable_items"].append(
{
"doc_type": file_type,
"web_url": web_url.replace(' ', '%20'),
"download_url": download_url.replace(' ', '%20'),
"compression_type": None
}
)
@@ -138,15 +167,17 @@ def parse_follow_page(self, response) -> typing.Union[DocItem, None]:
doc_item["downloadable_items"] = [
{
"doc_type": 'html',
"web_url": response.url.replace(' ', '%20'),
"download_url": response.url.replace(' ', '%20'),
"compression_type": None
}
]

doc_item["downloadable_items"] + supp_downloadable_items

doc_item["version_hash_raw_data"].update({
"item_currency": doc_item["downloadable_items"][0]["web_url"],
"item_currency": doc_item["downloadable_items"][0]["download_url"],
})

doc_item["version_hash"] = dict_to_sha256_hex_digest(doc_item["version_hash_raw_data"])

yield doc_item
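
One line that both the old and new spider code share, item["downloadable_items"] + supp_downloadable_items, concatenates two lists and discards the result, so the supplemental items are never attached. If attaching them is the intent, a rough sketch (with invented field values) would use extend or += instead:

# Invented entries, shaped like the dicts the spider builds.
item = {
    "downloadable_items": [
        {"doc_type": "pdf", "download_url": "https://example.mil/guidance.pdf", "compression_type": None},
    ]
}
supp_downloadable_items = [
    {"doc_type": "pdf", "download_url": "https://example.mil/annex%20a.pdf", "compression_type": None},
]

# A bare `a + b` builds a new list and throws it away; extend() mutates in place.
item["downloadable_items"].extend(supp_downloadable_items)
print(len(item["downloadable_items"]))  # 2
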
@@ -103,7 +103,7 @@ def parse_page(self, response):
'doc_name': doc_name,
'doc_num': doc_num,
'doc_title': doc_title,
- 'doc_type': file_type,
+ 'doc_type': self.doc_type,
'cac_login_required': False,
'source_page_url':current_url,
'download_url': download_url,
@@ -149,7 +149,8 @@ def populate_doc_item(self, fields):
"doc_num": doc_num,
#"publication_date": publication_date,
"download_url": download_url,
"display_title": display_title
"display_title": display_title,
"doc_type": doc_type,
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)
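Across all three files, the commit widens version_hash_fields (doc_name, display_title, and download_url in the coronavirus spider; doc_type in the last hunk above) before handing it to dict_to_sha256_hex_digest, so a document is treated as changed whenever any of those fields changes. The real helper lives in dataPipelines/gc_scrapy/gc_scrapy/utils.py and is not shown in this diff; the sketch below only assumes it serializes the dict deterministically and hashes the result.

import hashlib
import json

def dict_to_sha256_hex_digest_sketch(fields: dict) -> str:
    # Assumed behavior only: stable key order, then SHA-256 of the serialized form.
    serialized = json.dumps(fields, sort_keys=True, default=str)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()

# Placeholder field values; adding doc_type (as the last hunk does) changes the digest.
without_doc_type = {"doc_name": "Example Article", "display_title": "Example: Title"}
with_doc_type = {**without_doc_type, "doc_type": "pdf"}
print(dict_to_sha256_hex_digest_sketch(without_doc_type)
      != dict_to_sha256_hex_digest_sketch(with_doc_type))  # True
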
