Merge pull request #235 from dod-advana/patch-hasc-crawler
Patch HASC Crawler
Showing 2 changed files with 104 additions and 152 deletions.
dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py (252 changes: 100 additions & 152 deletions)
@@ -1,190 +1,138 @@
-from calendar import day_abbr
-from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
-from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
+from typing import Any, Generator
+from urllib.parse import urljoin
+from datetime import datetime
import scrapy
-import typing as t
+from scrapy.http import Response

-from urllib.parse import urljoin, urlparse
-from datetime import datetime
-from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
+from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
+from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
+from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

-# only scrape witness statements
-# 193 documents

class HASCSpider(GCSpider):
-    name = "HASC" # Crawler name

+    """
+    As of 06/24/2024
+    crawls https://armedservices.house.gov/committee-activity/hearings/all for 179 pdfs (doc_type = Witness Statement)
+    """

+    # Crawler name
+    name = "HASC"
+    # Level 1: GC app 'Source' filter for docs from this crawler
+    display_org = "Congress"
+    # Level 2: GC app 'Source' metadata field for docs from this crawler
+    data_source = "House Armed Services Committee Publications"
+    # Level 3 filter
+    source_title = "House Armed Services Committee"

    allowed_domains = ["armedservices.house.gov"]
    base_url = "https://armedservices.house.gov"
+    start_urls = [f"{base_url}/committee-activity/hearings/all?page=0"]

-    start_urls = [base_url]

+    randomly_delay_request = True
    rotate_user_agent = True

-    randomly_delay_request = True
    custom_settings = {
        **GCSpider.custom_settings,
-        "AUTOTHROTTLE_ENABLED": True,
+        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 10,
        "AUTOTHROTTLE_MAX_DELAY": 60,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
    }

-    def parse(self, _):
-        pages_parser_map = [
-            (f"{self.base_url}/hearings", self.recursive_parse_hearings),
-            # (f"{self.base_url}/legislation",) # setup for future crawlers if needed
-        ]

-        for page_url, parser_func in pages_parser_map:
-            yield scrapy.Request(page_url, callback=parser_func)

    @staticmethod
-    def get_next_relative_url(response):
-        return response.css("li.pager-next > a::attr(href)").get()

-    def recursive_parse_hearings(self, response):

-        yield from self.parse_hearings_table_page(response)

-        next_relative_url = self.get_next_relative_url(response)
-        if next_relative_url:
-            next_url = f"{self.base_url}{next_relative_url}"
-            yield scrapy.Request(url=next_url, callback=self.recursive_parse_hearings)

-    def parse_hearings_table_page(self, response):

-        rows = response.css(
-            "div.view-content div")
+    def extract_doc_name_from_url(url: str) -> str:
+        """Returns a doc name given a full URL"""
+        doc_name = url.split("/")[-1]
+        return (
+            doc_name.replace(".pdf", "")
+            .replace("%", "_")
+            .replace(".", "")
+            .replace("-", "")
+        )

+    def parse(self, response: Response) -> Generator[DocItem, Any, None]:
+        """Recursively parses doc items out of House Armed Services Committee site"""
+        rows = response.css(".evo-views-row")

        for row in rows:
            try:
-                link = row.css("h3.field-content a::attr(href)").get()

+                link = row.css("div.h3.mt-0.font-weight-bold a::attr(href)").get()
                if not link:
                    continue

                follow_link = f"{self.base_url}{link}"
-                yield scrapy.Request(url=follow_link, callback=self.parse_hearing_detail_page)
+                yield scrapy.Request(url=follow_link, callback=self.parse_hearing_page)
            except Exception as e:
                print(e)

-    def extract_doc_name_from_url(self, url):
-        doc_name = url.split('/')[-1]
-        doc_name = doc_name.replace('.pdf', '').replace('%', '_').replace('.', '').replace('-', '')
-        return doc_name
+        # If data was found in this table then check the next page
+        if len(rows) > 0:
+            current_page_id = int(response.url[-1])
+            next_url = f"{response.url[0:-1]}{current_page_id+1}"
+            yield scrapy.Request(url=next_url, callback=self.parse)

-    def parse_hearing_detail_page(self, response):
+    def parse_hearing_page(self, response: Response) -> Generator[DocItem, Any, None]:
+        """Parses all statements available given a hearing details page"""
        try:
            # Get the basic details like title and date from the page
-            title = self.ascii_clean(response.css("#page-title ::text").get())
-            date_el = response.css("span.date-display-single ::text").get()
-            date_split = date_el.split()
-            month, day, year = date_split[1], date_split[2], date_split[3]
-            date = f"{month} {day} {year}"
+            doc_title = self.ascii_clean(response.css("h1.display-4 ::text").get())
+            publication_date = datetime.strptime(
+                response.css("time ::text").get(), "%a, %m/%d/%Y - %I:%M %p"
+            )
            doc_type = "Witness Statement"

            # Extract names of speakers
-            names = response.css('b ::text').getall()
+            speaker_names = response.css("b ::text").getall()

-            # Find all <a> tags within <p> tags and check if they contain the word "statement" and point to a PDF
-            links = response.css("p a")
-            for link in links:
+            # Find all links and check if they contain the word "statement" and point to a PDF
+            for link in response.css("p a"):
                href = link.css("::attr(href)").get()
-                link_text = link.css("::text").get("").lower() # Get the text and convert it to lower case for comparison

-                # Check if "statement" is in the link text and the href ends with ".pdf"
-                if "statement" in link_text and href and href.endswith(".pdf"):
-                    # Check if any of the speaker names is in the link text
-                    for name in names:
-                        if name.lower() in link_text:
-                            follow_link = urljoin(self.base_url, href)
-                            display_title = self.ascii_clean(f"HASC {title} - {name}")
-                            doc_name = self.extract_doc_name_from_url(follow_link)

-                            # Set up the fields with the new PDF URL
-                            fields = {
-                                'doc_name': doc_name,
-                                'doc_num': ' ', # No doc num for this crawler
-                                'doc_title': title,
-                                'doc_type': doc_type,
-                                'cac_login_required': False,
-                                'source_page_url': response.url,
-                                'download_url': follow_link,
-                                'publication_date': date,
-                                'file_ext': 'pdf', # Set to return pdf NOT html
-                                'display_title': display_title
-                            }
-                            # Instantiate DocItem class and assign document's metadata values
-                            doc_item = self.populate_doc_item(fields)
+                if not href or not href.endswith(".pdf"):
+                    continue

+                # Get the text and convert it to lower case for comparison
+                link_text = link.css("::text").get("").lower()
+                if "statement" not in link_text:
+                    continue

-                            yield doc_item
+                # Check if any of the speaker names is in the link text
+                for speaker_name in speaker_names:
+                    if speaker_name.lower() not in link_text:
+                        continue

+                    follow_link = urljoin(self.base_url, href)
+                    display_title = self.ascii_clean(f"HASC {doc_title} - {speaker_name}")
+                    doc_name = self.extract_doc_name_from_url(follow_link)

+                    fields = DocItemFields(
+                        doc_name=doc_name,
+                        doc_title=doc_title,
+                        doc_num=" ",
+                        doc_type=doc_type,
+                        publication_date=publication_date,
+                        cac_login_required=False,
+                        source_page_url=response.url,
+                        downloadable_items=[
+                            {
+                                "doc_type": "pdf",
+                                "download_url": follow_link,
+                                "compression_type": None,
+                            }
+                        ],
+                        download_url=follow_link,
+                        file_ext="pdf",
+                        display_doc_type=doc_type,
+                    )
+                    # Match fields to previous crawler iterations
+                    fields.remove_version_hash_field("doc_num")
+                    fields.set_version_hash_field("doc_title", doc_title)
+                    fields.set_display_name(display_title)

+                    yield fields.populate_doc_item(
+                        display_org=self.display_org,
+                        data_source=self.data_source,
+                        source_title=self.source_title,
+                        crawler_used=self.name,
+                    )

        except Exception as e:
            print(e)


-    def populate_doc_item(self, fields):
-        # '''
-        # This functions provides both hardcoded and computed values for the variables
-        # in the imported DocItem object and returns the populated metadata object
-        # '''
-        display_org = "Congress" # Level 1: GC app 'Source' filter for docs from this crawler
-        data_source = "House Armed Services Committee Publications" # Level 2: GC app 'Source' metadata field for docs from this crawler
-        source_title = "House Armed Services Committee" # Level 3 filter

-        doc_name = fields['doc_name']
-        doc_num = fields['doc_num']
-        doc_title = fields['doc_title']
-        doc_type = fields['doc_type']
-        cac_login_required = fields['cac_login_required']
-        download_url = fields['download_url']
-        publication_date = get_pub_date(fields['publication_date'])

-        display_doc_type = fields['doc_type'] # Doc type for display on app
-        display_source = data_source + " - " + source_title
-        display_title = fields['display_title']
-        is_revoked = False
-        source_page_url = fields['source_page_url']
-        source_fqdn = urlparse(source_page_url).netloc

-        downloadable_items = [{
-            "doc_type": fields['file_ext'],
-            "download_url": download_url,
-            "compression_type": None,
-        }]
-        file_ext = fields['file_ext'] # Set to return pdf NOT html

-        ## Assign fields that will be used for versioning
-        version_hash_fields = {
-            "doc_name":doc_name,
-            "doc_title": fields['doc_title'],
-            "publication_date": publication_date,
-            "download_url": download_url,
-            "display_title": display_title
-        }

-        version_hash = dict_to_sha256_hex_digest(version_hash_fields)

-        return DocItem(
-            doc_name = doc_name,
-            doc_title = doc_title,
-            doc_num = doc_num,
-            doc_type = doc_type,
-            display_doc_type = display_doc_type, #
-            publication_date = publication_date,
-            cac_login_required = cac_login_required,
-            crawler_used = self.name,
-            downloadable_items = downloadable_items,
-            source_page_url = source_page_url, #
-            source_fqdn = source_fqdn, #
-            download_url = download_url, #
-            version_hash_raw_data = version_hash_fields, #
-            version_hash = version_hash,
-            display_org = display_org, #
-            data_source = data_source, #
-            source_title = source_title, #
-            display_source = display_source, #
-            display_title = display_title, #
-            file_ext = file_ext, # Set to return pdf NOT html
-            is_revoked = is_revoked, #
-        )
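
For a quick local check of the patched spider, Scrapy's standard CrawlerProcess API can drive the class directly. The snippet below is a minimal sketch rather than part of this commit: it assumes the repository's dataPipelines package is importable from the working directory, and the FEEDS setting and hasc_output.json filename are illustrative only. Because start_urls now points at ?page=0 and parse() requeues ?page=N+1 whenever a listing page yields rows, a single crawl walks the hearings listing until an empty page is reached.

from scrapy.crawler import CrawlerProcess

from dataPipelines.gc_scrapy.gc_scrapy.spiders.hasc_spider import HASCSpider

# Minimal local run of the patched crawler; the JSON feed path is illustrative.
# HASCSpider.custom_settings (autothrottle, one request per domain) still apply.
process = CrawlerProcess(settings={"FEEDS": {"hasc_output.json": {"format": "json"}}})
process.crawl(HASCSpider)
process.start()  # blocks until the crawl completes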