Merge pull request #178 from dod-advana/varunt/crawler_fixes
SASC and CFR changes
takao8 committed Jun 21, 2023
2 parents 99374a0 + 3ac9583 commit acb50da
Showing 2 changed files with 6 additions and 17 deletions.
dataPipelines/gc_scrapy/gc_scrapy/spiders/cfr_spider.py (11 changes: 0 additions, 11 deletions)
@@ -31,9 +31,6 @@ class CFRSpider(GCSpider):
         "x-requested-with": "XMLHttpRequest"
     }
 
-    def start_requests(self):
-        yield scrapy.Request(url=self.start_urls[0], method='GET', headers=self.headers)
-
     @staticmethod
     def get_pub_date(publication_date):
         '''
@@ -91,14 +88,6 @@ def get_package_ids(self, response):
             yield response.follow(url=detail_url, callback=self.parse_detail_data, meta={"offset": 0, "year": year},
                                   headers=self.headers)
 
-            # iterate offset
-            next_offset = current_offset + 1
-            next_offset_url = response.url.replace(
-                f'offset={current_offset}', f'offset={next_offset}')
-
-            yield response.follow(url=next_offset_url, callback=self.get_package_ids, meta={"offset": next_offset, "year": year},
-                                  headers=self.headers)
-
     def parse_detail_data(self, response):
         data = json.loads(response.body)
         year = response.meta["year"]
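Context for the CFR change: with the custom start_requests gone, scrapy's stock Spider.start_requests issues the initial GET for each entry in start_urls, and dropping the offset-increment block removes the spider's manual pagination loop. A minimal sketch of the fallback behavior, assuming the spider inherits scrapy's defaults (CFRSpiderSketch, the placeholder URL, and the parse callback below are illustrative, not the repo's code):

import scrapy


class CFRSpiderSketch(scrapy.Spider):
    name = "cfr_sketch"
    # Placeholder URL; the real spider's start_urls are not shown in this diff.
    start_urls = ["https://www.govinfo.gov/wssearch/rb/cfr"]

    # Without an overridden start_requests, headers that used to be attached
    # per-request can instead be applied to every request via settings.
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "x-requested-with": "XMLHttpRequest",
        }
    }

    def parse(self, response):
        # Default callback for requests produced by the built-in
        # start_requests; the real spider routes responses to get_package_ids.
        self.logger.info("fetched %s", response.url)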
dataPipelines/gc_scrapy/gc_scrapy/spiders/sasc_spider.py (12 changes: 6 additions, 6 deletions)
@@ -74,7 +74,7 @@ def parse_hearing_detail_page(self, response):
         # Get the hearing detail page as a document
 
         main = response.css("div.SiteLayout__main")
-        title_raw = self.ascii_clean(main.css("h1.Heading__title ::text").get().strip())
+        title_raw = self.ascii_clean(main.css("h1.Heading__title::text")[-1].get().strip())
         title = ' '.join(title_raw.split())
         date = main.css('div.Hearing__detail time::attr(datetime)').get()
         spaced_title = f" - {title}" if title else ""
@@ -149,9 +149,9 @@ def parse_hearing_detail_page(self, response):
         for witdoc in witness_docs:
             witness_href = witdoc.css('a::attr(href)').get()
             if witness_href is not None:
-                honorific = witblock.css('h4.Heading__title span:nth-child(1)::text').get()
-                wit_name = witblock.css('h4.Heading__title span:nth-child(2)::text').get()
-                member_name = witblock.css('h4.Heading__title ::text').get()
+                honorific = witblock.css('h3.Heading__title span:nth-child(1)::text').get()
+                wit_name = witblock.css('h3.Heading__title span:nth-child(2)::text').get()
+                member_name = witblock.css('h3.Heading__title ::text').get()
 
                 if honorific and wit_name is not None:
                     full_name_raw = f"{honorific} {wit_name}"
@@ -217,7 +217,7 @@ def populate_doc_item(self, fields):
 
         display_doc_type = fields['display_doc_type']  # Doc type for display on app
         display_source = data_source + " - " + source_title
-        display_title = doc_type + " - " + doc_title  # Different than other crawlers due to lack of doc_num; added a dash for clarity
+        display_title = doc_type + ": " + doc_title  # Different than other crawlers due to lack of doc_num; a colon separates type and title for clarity
         is_revoked = False
         source_page_url = fields['source_page_url']
         source_fqdn = urlparse(source_page_url).netloc
@@ -233,7 +233,7 @@ def populate_doc_item(self, fields):
             "download_url": download_url,
             "display_title": display_title
         }
-        
+
         version_hash = dict_to_sha256_hex_digest(version_hash_fields)
 
         return DocItem(
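The title-selector fix is easier to see in isolation. A minimal parsel sketch (the HTML snippet is invented to mirror the heading structure the fix implies, with a decorative span inside the title):

from parsel import Selector

html = """
<div class="SiteLayout__main">
  <h1 class="Heading__title"><span>Hearings</span>
    Department of Defense Budget Posture</h1>
</div>
"""
sel = Selector(text=html)

# Old selector: "h1.Heading__title ::text" (note the space) matches every
# text node in the subtree, so .get() returns the first one -- the span's
# "Hearings" -- rather than the title.
old = sel.css("h1.Heading__title ::text").get().strip()

# New selector: "h1.Heading__title::text" matches only the h1's own text
# nodes, and [-1] picks the last of them, the actual title text.
new = sel.css("h1.Heading__title::text")[-1].get().strip()

print(old)  # Hearings
print(new)  # Department of Defense Budget Posture

The h4-to-h3 edits in the witness block presumably track a markup change on the committee site; the nth-child span pattern itself is unchanged.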

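One line of unchanged context is worth a note: version_hash = dict_to_sha256_hex_digest(version_hash_fields) is what makes the display_title change ripple into versioning, since display_title is one of the hashed fields. A plausible sketch of what a helper like dict_to_sha256_hex_digest does (the repo's actual implementation is not shown in this diff; sorted-key JSON canonicalization is an assumption):

import hashlib
import json


def dict_to_sha256_hex_digest_sketch(d: dict) -> str:
    # Assumed behavior: serialize the dict deterministically so equal dicts
    # always hash the same, then return the SHA-256 hex digest of the bytes.
    canonical = json.dumps(d, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()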