Commit d6d366c

Merge pull request #180 from dod-advana/varunt/crawler_fixes

DOD Corona Virus and MILPERSMAN fixes

takao8 committed Jul 27, 2023
2 parents acb50da + 50d822e commit d6d366c

Showing 3 changed files with 55 additions and 19 deletions.
8 changes: 6 additions & 2 deletions dataPipelines/gc_scrapy/gc_scrapy/pipelines.py
@@ -234,7 +234,8 @@ def item_completed(self, results, item, info):
# Build a path to each file associated with an item:
if compression_type:
file_download_path = Path(self.output_dir, output_file_name).with_suffix(f".{compression_type}") # Path for downloaded zipped file
- file_unzipped_path = Path(self.output_dir, output_file_name) # Path for unzipped files
+ file_unzipped_path = Path(self.output_dir, output_file_name)
+ # Path for unzipped files
metadata_download_path = f"{file_unzipped_path}.metadata" # Path for the accompanying metadata file
else:
# If it is a jbook crawler (and needs a different file output style)
@@ -267,7 +268,10 @@ def item_completed(self, results, item, info):

metadata_download_path = Path(self.output_dir, unzipped_item["doc_name"])
suffix_doc_type = f'{unzipped_item["downloadable_items"][0]["doc_type"]}'
- metadata_download_path = metadata_download_path.with_suffix(f'.{suffix_doc_type}.metadata')
+
+ # when making metadata_download_path, need to add the previous suffix in case there are
+ # periods in filename. will mess up metadata names otherwise
+ metadata_download_path = metadata_download_path.with_suffix(metadata_download_path.suffix + f'.{suffix_doc_type}.metadata')

with open(metadata_download_path, "w") as f: # Write the metadata for each unzipped file
try:
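The with_suffix() change in the second hunk above is easier to see in isolation. A minimal sketch, using a made-up document name (nothing here comes from the crawler's real output): Path.with_suffix() replaces everything after the last period, so calling it directly on a name that contains periods silently drops part of the name, while appending to the existing suffix keeps it intact.

from pathlib import Path

# Made-up document name containing periods; not taken from real crawler output.
doc_path = Path("/tmp/output/DoD COVID-19 Guidance v1.2")

# Old call: with_suffix() treats ".2" as the suffix and replaces it,
# so part of the document name is silently lost.
print(doc_path.with_suffix(".pdf.metadata").name)
# -> DoD COVID-19 Guidance v1.pdf.metadata

# New call: keep the existing suffix and append to it, as the committed fix does.
print(doc_path.with_suffix(doc_path.suffix + ".pdf.metadata").name)
# -> DoD COVID-19 Guidance v1.2.pdf.metadata
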
61 changes: 46 additions & 15 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/dod_coronavirus_spider.py
@@ -1,7 +1,9 @@
import typing
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
+ from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
import re
+ from urllib.parse import urlparse

covid_re = re.compile(r'covid|covid\-19|coronavirus', flags=re.IGNORECASE)

@@ -23,15 +25,14 @@ def get_downloadable_item(self, href, base_url=None) -> dict:
base_url = self.start_urls[0]

file_type = self.get_href_file_extension(href)
- web_url = self.ensure_full_href_url(href, base_url)
+ download_url = self.ensure_full_href_url(href, base_url)
return {
"doc_type": file_type,
"web_url": web_url.replace(' ', '%20'),
"download_url": download_url.replace(' ', '%20'),
"compression_type": None
}

def parse(self, response):

blocks = response.css('div.dgov-grid div.block')

for block in blocks:
@@ -43,7 +44,7 @@ def parse(self, response):
doc_title = self.ascii_clean(doc_title_raw)

href_raw = item.css('a.title::attr(href)').get()
- web_url = self.ensure_full_href_url(
+ download_url = self.ensure_full_href_url(
href_raw, self.start_urls[0])

(file_type, has_ext) = self.get_href_file_extension_does_exist(
@@ -66,32 +67,60 @@ def parse(self, response):
self.get_downloadable_item(href) for href in supplamental_hrefs
]

doc_name = f"{category_text}: {doc_title}"
display_doc_type = "Document" # Doc type for display on app
display_source = self.data_source + " - " + self.source_title
display_title = self.doc_type + ": " + doc_title
is_revoked = False
source_page_url = download_url
source_fqdn = urlparse(source_page_url).netloc

version_hash_fields = {
"publication_date": publication_date,
"noted": noted
"noted": noted,
"doc_name": doc_name,
"display_title": display_title,
"download_url": source_page_url
}

+ version_hash = dict_to_sha256_hex_digest(version_hash_fields)

doc_name = f"{category_text}: {doc_title}"
item = DocItem(
doc_name=doc_name,
doc_title=doc_title,
publication_date=publication_date,
version_hash_raw_data=version_hash_fields,
doc_name = doc_name,
doc_title = doc_title,
doc_type = self.doc_type,
display_doc_type = display_doc_type, #
cac_login_required = self.cac_login_required,
crawler_used = self.name,
source_page_url = source_page_url, #
source_fqdn = source_fqdn, #
version_hash_raw_data = version_hash_fields, #
display_org = self.display_org, #
data_source = self.data_source, #
source_title = self.source_title, #
display_source = display_source, #
display_title = display_title, #
file_ext = self.doc_type, #
is_revoked = is_revoked, #
version_hash = version_hash,
download_url=source_page_url,
doc_num = "None",
)

# some are downloadable items straight from start url
if has_ext:
item["downloadable_items"] = [
{
"doc_type": file_type,
"web_url": web_url.replace(' ', '%20'),
"download_url": download_url.replace(' ', '%20'),
"compression_type": None
}
]
item["downloadable_items"] + supp_downloadable_items

item["version_hash_raw_data"].update({
"item_currency": item["downloadable_items"][0]["web_url"],
"item_currency": item["downloadable_items"][0]["download_url"],
})

yield item
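
The rewritten parse() above now derives source_fqdn from the page URL before building the DocItem. A minimal sketch of that one line, using a placeholder URL rather than the spider's actual start URL:

from urllib.parse import urlparse

# Placeholder URL for illustration; the spider uses its own start_urls and hrefs.
source_page_url = "https://www.example.mil/Spotlights/Coronavirus/Some%20Guidance.pdf"

# urlparse().netloc keeps only the host portion, which is what source_fqdn stores.
source_fqdn = urlparse(source_page_url).netloc
print(source_fqdn)  # www.example.mil
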
@@ -123,12 +152,12 @@ def parse_follow_page(self, response) -> typing.Union[DocItem, None]:

for href in hrefs:
(file_type, has_ext) = self.get_href_file_extension_does_exist(href)
- web_url = self.ensure_full_href_url(href, self.start_urls[0])
+ download_url = self.ensure_full_href_url(href, self.start_urls[0])
if has_ext:
doc_item["downloadable_items"].append(
{
"doc_type": file_type,
"web_url": web_url.replace(' ', '%20'),
"download_url": download_url.replace(' ', '%20'),
"compression_type": None
}
)
@@ -138,15 +167,17 @@ def parse_follow_page(self, response) -> typing.Union[DocItem, None]:
doc_item["downloadable_items"] = [
{
"doc_type": 'html',
"web_url": response.url.replace(' ', '%20'),
"download_url": response.url.replace(' ', '%20'),
"compression_type": None
}
]

doc_item["downloadable_items"] + supp_downloadable_items

doc_item["version_hash_raw_data"].update({
"item_currency": doc_item["downloadable_items"][0]["web_url"],
"item_currency": doc_item["downloadable_items"][0]["download_url"],
})

doc_item["version_hash"] = dict_to_sha256_hex_digest(doc_item["version_hash_raw_data"])

yield doc_item
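
One line that both the old and new spider code share, item["downloadable_items"] + supp_downloadable_items, concatenates two lists and discards the result, so the supplemental items are never attached. If attaching them is the intent, a rough sketch (with invented field values) would use extend or += instead:

# Invented entries, shaped like the dicts the spider builds.
item = {
    "downloadable_items": [
        {"doc_type": "pdf", "download_url": "https://example.mil/guidance.pdf", "compression_type": None},
    ]
}
supp_downloadable_items = [
    {"doc_type": "pdf", "download_url": "https://example.mil/annex%20a.pdf", "compression_type": None},
]

# A bare `a + b` builds a new list and throws it away; extend() mutates in place.
item["downloadable_items"].extend(supp_downloadable_items)
print(len(item["downloadable_items"]))  # 2
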
@@ -103,7 +103,7 @@ def parse_page(self, response):
'doc_name': doc_name,
'doc_num': doc_num,
'doc_title': doc_title,
- 'doc_type': file_type,
+ 'doc_type': self.doc_type,
'cac_login_required': False,
'source_page_url':current_url,
'download_url': download_url,
@@ -149,7 +149,8 @@ def populate_doc_item(self, fields):
"doc_num": doc_num,
#"publication_date": publication_date,
"download_url": download_url,
"display_title": display_title
"display_title": display_title,
"doc_type": doc_type,
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)
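Across all three files, the commit widens version_hash_fields (doc_name, display_title, and download_url in the coronavirus spider; doc_type in the last hunk above) before handing it to dict_to_sha256_hex_digest, so a document is treated as changed whenever any of those fields changes. The real helper lives in dataPipelines/gc_scrapy/gc_scrapy/utils.py and is not shown in this diff; the sketch below only assumes it serializes the dict deterministically and hashes the result.

import hashlib
import json

def dict_to_sha256_hex_digest_sketch(fields: dict) -> str:
    # Assumed behavior only: stable key order, then SHA-256 of the serialized form.
    serialized = json.dumps(fields, sort_keys=True, default=str)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()

# Placeholder field values; adding doc_type (as the last hunk does) changes the digest.
without_doc_type = {"doc_name": "Example Article", "display_title": "Example: Title"}
with_doc_type = {**without_doc_type, "doc_type": "pdf"}
print(dict_to_sha256_hex_digest_sketch(without_doc_type)
      != dict_to_sha256_hex_digest_sketch(with_doc_type))  # True
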
