Merge pull request #237 from dod-advana/patch-stale-spiders

Patch tradoc and national_guard Spiders
dod-advana · Jul 16, 2024 · aa7da0c · aa7da0c
2 parents 4de0bdd + ebbb328
commit aa7da0c
Show file tree

Hide file tree

Showing 2 changed files with 152 additions and 127 deletions.
diff --git a/dataPipelines/gc_scrapy/gc_scrapy/spiders/chief_national_guard_bureau_spider.py b/dataPipelines/gc_scrapy/gc_scrapy/spiders/chief_national_guard_bureau_spider.py
@@ -1,66 +1,70 @@
+from urllib.parse import urlparse
+
 from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
 from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
-from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
-from urllib.parse import urlparse
+from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
+
 
 class CNGBISpider(GCSpider):
     """
-        Parser for Chief National Guard Bureau Instructions
+    Parser for Chief National Guard Bureau Instructions
     """
 
-    name = "National_Guard"  # Crawler name
-    display_org = "National Guard"  # Level 1: GC app 'Source' filter for docs from this crawler
-    data_source = "National Guard Bureau Publications & Forms Library"  # Level 2: GC app 'Source' metadata field for docs from this crawler
-    source_title = "Unlisted Source"  # Level 3 filter
+    # Crawler name
+    name = "National_Guard"
+    # Level 1: GC app 'Source' filter for docs from this crawler
+    display_org = "National Guard"
+    # Level 2: GC app 'Source' metadata field for docs from this crawler
+    data_source = "National Guard Bureau Publications & Forms Library"
+    # Level 3 filter
+    source_title = "Unlisted Source"
     display_source = data_source + " - " + source_title
 
-    allowed_domains = ['ngbpmc.ng.mil']
-    start_urls = [
-        'https://www.ngbpmc.ng.mil/publications1/cngbi/'
-    ]
+    allowed_domains = ["ngbpmc.ng.mil"]
+    start_urls = ["https://www.ngbpmc.ng.mil/Publications/CNGB-Instructions/"]
 
     file_type = "pdf"
     doc_type = "CNGBI"
     rotate_user_agent = True
 
     def parse(self, response):
-        rows = response.css('div.WordSection1 table tbody tr')
+        rows = response.css("div.WordSection1 table tbody tr")
         for row in rows:
-            href_raw = row.css('td:nth-child(1) a::attr(href)').get()
+            href_raw = row.css("td:nth-child(1) a::attr(href)").get()
 
-            if not href_raw.startswith('/'):
+            if not href_raw.startswith("/"):
                 cac_login_required = True
             else:
                 cac_login_required = False
 
             web_url = self.ensure_full_href_url(href_raw, self.start_urls[0])
 
             file_type = self.get_href_file_extension(href_raw)
-            web_url = web_url.replace(' ', '%20')
+            web_url = web_url.replace(" ", "%20")
             downloadable_items = [
                 {
                     "doc_type": file_type,
                     "download_url": web_url,
-                    "compression_type": None
+                    "compression_type": None,
                 }
             ]
 
-            # a lot of the docs have the space unicode \xa0 in them. replacing it before getting doc_num
-            doc_name_raw = row.css('td:nth-child(1) a::text')
+            # many doc names contain the non-breaking space \xa0; replace it with a plain space before deriving doc_num
+            doc_name_raw = row.css("td:nth-child(1) a::text")
             if doc_name_raw:
-                doc_name_raw = doc_name_raw.get().replace(u'\xa0', ' ')
+                doc_name_raw = doc_name_raw.get().replace("\xa0", " ")
             else:
                 continue
 
-            doc_num_raw = doc_name_raw.replace('CNGBI ', '')
+            doc_num_raw = doc_name_raw.replace("CNGBI ", "")
 
-            publication_date = row.css('td:nth-child(2) span::text').get()
+            publication_date = row.css("td:nth-child(2) span::text").get()
 
-            doc_title_raw = row.css('td:nth-child(3) a::text').get()
+            doc_title_raw = row.css("td:nth-child(3) a::text").get()
             if doc_title_raw is None:
-                doc_title_raw = row.css('td:nth-child(3) span::text').get()
+                doc_title_raw = row.css("td:nth-child(3) span::text").get()
                 if doc_title_raw is None:
-                    doc_title_raw = row.css('td:nth-child(3) font::text').get()
+                    doc_title_raw = row.css("td:nth-child(3) font::text").get()
                     if doc_title_raw is None:
                         print("uh oh")
 
@@ -72,7 +76,7 @@ def parse(self, response):
             version_hash_fields = {
                 "item_currency": href_raw,
                 "document_title": doc_title,
-                "document_number": doc_num_raw
+                "document_number": doc_num_raw,
             }
             version_hash = dict_to_sha256_hex_digest(version_hash_fields)
 
@@ -99,4 +103,3 @@ def parse(self, response):
                 file_ext=self.file_type,
                 is_revoked=False,
             )
-