Merge pull request #235 from dod-advana/patch-hasc-crawler
Patch HASC Crawler
Showing 2 changed files with 104 additions and 152 deletions.
dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py (252 changes: 100 additions & 152 deletions)
@@ -1,190 +1,138 @@
-from calendar import day_abbr
-from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
-from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
+from typing import Any, Generator
+from urllib.parse import urljoin
+from datetime import datetime
import scrapy
-import typing as t
+from scrapy.http import Response

-from urllib.parse import urljoin, urlparse
-from datetime import datetime
-from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
+from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
+from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
+from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

-# only scrape witness statements
-# 193 documents

class HASCSpider(GCSpider):
-    name = "HASC" # Crawler name

+    """
+    As of 06/24/2024
+    crawls https://armedservices.house.gov/committee-activity/hearings/all for 179 pdfs (doc_type = Witness Statement)
+    """

+    # Crawler name
+    name = "HASC"
+    # Level 1: GC app 'Source' filter for docs from this crawler
+    display_org = "Congress"
+    # Level 2: GC app 'Source' metadata field for docs from this crawler
+    data_source = "House Armed Services Committee Publications"
+    # Level 3 filter
+    source_title = "House Armed Services Committee"

    allowed_domains = ["armedservices.house.gov"]
    base_url = "https://armedservices.house.gov"
+    start_urls = [f"{base_url}/committee-activity/hearings/all?page=0"]

-    start_urls = [base_url]

+    randomly_delay_request = True
    rotate_user_agent = True

-    randomly_delay_request = True
    custom_settings = {
        **GCSpider.custom_settings,
-        "AUTOTHROTTLE_ENABLED": True,
+        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 10,
        "AUTOTHROTTLE_MAX_DELAY": 60,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
    }

-    def parse(self, _):
-        pages_parser_map = [
-            (f"{self.base_url}/hearings", self.recursive_parse_hearings),
-            # (f"{self.base_url}/legislation",) # setup for future crawlers if needed
-        ]

-        for page_url, parser_func in pages_parser_map:
-            yield scrapy.Request(page_url, callback=parser_func)

    @staticmethod
-    def get_next_relative_url(response):
-        return response.css("li.pager-next > a::attr(href)").get()

-    def recursive_parse_hearings(self, response):

-        yield from self.parse_hearings_table_page(response)

-        next_relative_url = self.get_next_relative_url(response)
-        if next_relative_url:
-            next_url = f"{self.base_url}{next_relative_url}"
-            yield scrapy.Request(url=next_url, callback=self.recursive_parse_hearings)

-    def parse_hearings_table_page(self, response):

-        rows = response.css(
-            "div.view-content div")
+    def extract_doc_name_from_url(url: str) -> str:
+        """Returns a doc name given a full URL"""
+        doc_name = url.split("/")[-1]
+        return (
+            doc_name.replace(".pdf", "")
+            .replace("%", "_")
+            .replace(".", "")
+            .replace("-", "")
+        )

+    def parse(self, response: Response) -> Generator[DocItem, Any, None]:
+        """Recursively parses doc items out of House Armed Services Committee site"""
+        rows = response.css(".evo-views-row")

        for row in rows:
            try:
-                link = row.css("h3.field-content a::attr(href)").get()

+                link = row.css("div.h3.mt-0.font-weight-bold a::attr(href)").get()
                if not link:
                    continue

                follow_link = f"{self.base_url}{link}"
-                yield scrapy.Request(url=follow_link, callback=self.parse_hearing_detail_page)
+                yield scrapy.Request(url=follow_link, callback=self.parse_hearing_page)
            except Exception as e:
                print(e)

-    def extract_doc_name_from_url(self, url):
-        doc_name = url.split('/')[-1]
-        doc_name = doc_name.replace('.pdf', '').replace('%', '_').replace('.', '').replace('-', '')
-        return doc_name
+        # If data was found in this table then check the next page
+        if len(rows) > 0:
+            current_page_id = int(response.url[-1])
+            next_url = f"{response.url[0:-1]}{current_page_id+1}"
+            yield scrapy.Request(url=next_url, callback=self.parse)

-    def parse_hearing_detail_page(self, response):
+    def parse_hearing_page(self, response: Response) -> Generator[DocItem, Any, None]:
+        """Parses all statements available given a hearing details page"""
        try:
            # Get the basic details like title and date from the page
-            title = self.ascii_clean(response.css("#page-title ::text").get())
-            date_el = response.css("span.date-display-single ::text").get()
-            date_split = date_el.split()
-            month, day, year = date_split[1], date_split[2], date_split[3]
-            date = f"{month} {day} {year}"
+            doc_title = self.ascii_clean(response.css("h1.display-4 ::text").get())
+            publication_date = datetime.strptime(
+                response.css("time ::text").get(), "%a, %m/%d/%Y - %I:%M %p"
+            )
            doc_type = "Witness Statement"

            # Extract names of speakers
-            names = response.css('b ::text').getall()
+            speaker_names = response.css("b ::text").getall()

-            # Find all <a> tags within <p> tags and check if they contain the word "statement" and point to a PDF
-            links = response.css("p a")
-            for link in links:
+            # Find all links and check if they contain the word "statement" and point to a PDF
+            for link in response.css("p a"):
                href = link.css("::attr(href)").get()
-                link_text = link.css("::text").get("").lower() # Get the text and convert it to lower case for comparison

-                # Check if "statement" is in the link text and the href ends with ".pdf"
-                if "statement" in link_text and href and href.endswith(".pdf"):
-                    # Check if any of the speaker names is in the link text
-                    for name in names:
-                        if name.lower() in link_text:
-                            follow_link = urljoin(self.base_url, href)
-                            display_title = self.ascii_clean(f"HASC {title} - {name}")
-                            doc_name = self.extract_doc_name_from_url(follow_link)

-                            # Set up the fields with the new PDF URL
-                            fields = {
-                                'doc_name': doc_name,
-                                'doc_num': ' ', # No doc num for this crawler
-                                'doc_title': title,
-                                'doc_type': doc_type,
-                                'cac_login_required': False,
-                                'source_page_url': response.url,
-                                'download_url': follow_link,
-                                'publication_date': date,
-                                'file_ext': 'pdf', # Set to return pdf NOT html
-                                'display_title': display_title
-                            }
-                            # Instantiate DocItem class and assign document's metadata values
-                            doc_item = self.populate_doc_item(fields)
+                if not href or not href.endswith(".pdf"):
+                    continue

+                # Get the text and convert it to lower case for comparison
+                link_text = link.css("::text").get("").lower()
+                if "statement" not in link_text:
+                    continue

-                            yield doc_item
+                # Check if any of the speaker names is in the link text
+                for speaker_name in speaker_names:
+                    if speaker_name.lower() not in link_text:
+                        continue

+                    follow_link = urljoin(self.base_url, href)
+                    display_title = self.ascii_clean(f"HASC {doc_title} - {speaker_name}")
+                    doc_name = self.extract_doc_name_from_url(follow_link)

+                    fields = DocItemFields(
+                        doc_name=doc_name,
+                        doc_title=doc_title,
+                        doc_num=" ",
+                        doc_type=doc_type,
+                        publication_date=publication_date,
+                        cac_login_required=False,
+                        source_page_url=response.url,
+                        downloadable_items=[
+                            {
+                                "doc_type": "pdf",
+                                "download_url": follow_link,
+                                "compression_type": None,
+                            }
+                        ],
+                        download_url=follow_link,
+                        file_ext="pdf",
+                        display_doc_type=doc_type,
+                    )
+                    # Match fields to previous crawler iterations
+                    fields.remove_version_hash_field("doc_num")
+                    fields.set_version_hash_field("doc_title", doc_title)
+                    fields.set_display_name(display_title)

+                    yield fields.populate_doc_item(
+                        display_org=self.display_org,
+                        data_source=self.data_source,
+                        source_title=self.source_title,
+                        crawler_used=self.name,
+                    )

        except Exception as e:
            print(e)


-    def populate_doc_item(self, fields):
-        # '''
-        # This functions provides both hardcoded and computed values for the variables
-        # in the imported DocItem object and returns the populated metadata object
-        # '''
-        display_org = "Congress" # Level 1: GC app 'Source' filter for docs from this crawler
-        data_source = "House Armed Services Committee Publications" # Level 2: GC app 'Source' metadata field for docs from this crawler
-        source_title = "House Armed Services Committee" # Level 3 filter

-        doc_name = fields['doc_name']
-        doc_num = fields['doc_num']
-        doc_title = fields['doc_title']
-        doc_type = fields['doc_type']
-        cac_login_required = fields['cac_login_required']
-        download_url = fields['download_url']
-        publication_date = get_pub_date(fields['publication_date'])

-        display_doc_type = fields['doc_type'] # Doc type for display on app
-        display_source = data_source + " - " + source_title
-        display_title = fields['display_title']
-        is_revoked = False
-        source_page_url = fields['source_page_url']
-        source_fqdn = urlparse(source_page_url).netloc

-        downloadable_items = [{
-            "doc_type": fields['file_ext'],
-            "download_url": download_url,
-            "compression_type": None,
-        }]
-        file_ext = fields['file_ext'] # Set to return pdf NOT html

-        ## Assign fields that will be used for versioning
-        version_hash_fields = {
-            "doc_name":doc_name,
-            "doc_title": fields['doc_title'],
-            "publication_date": publication_date,
-            "download_url": download_url,
-            "display_title": display_title
-        }

-        version_hash = dict_to_sha256_hex_digest(version_hash_fields)

-        return DocItem(
-            doc_name = doc_name,
-            doc_title = doc_title,
-            doc_num = doc_num,
-            doc_type = doc_type,
-            display_doc_type = display_doc_type, #
-            publication_date = publication_date,
-            cac_login_required = cac_login_required,
-            crawler_used = self.name,
-            downloadable_items = downloadable_items,
-            source_page_url = source_page_url, #
-            source_fqdn = source_fqdn, #
-            download_url = download_url, #
-            version_hash_raw_data = version_hash_fields, #
-            version_hash = version_hash,
-            display_org = display_org, #
-            data_source = data_source, #
-            source_title = source_title, #
-            display_source = display_source, #
-            display_title = display_title, #
-            file_ext = file_ext, # Set to return pdf NOT html
-            is_revoked = is_revoked, #
-        )
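
For a quick local check of the patched spider, Scrapy's standard CrawlerProcess API can drive the class directly. The snippet below is a minimal sketch rather than part of this commit: it assumes the repository's dataPipelines package is importable from the working directory, and the FEEDS setting and hasc_output.json filename are illustrative only. Because start_urls now points at ?page=0 and parse() requeues ?page=N+1 whenever a listing page yields rows, a single crawl walks the hearings listing until an empty page is reached.

from scrapy.crawler import CrawlerProcess

from dataPipelines.gc_scrapy.gc_scrapy.spiders.hasc_spider import HASCSpider

# Minimal local run of the patched crawler; the JSON feed path is illustrative.
# HASCSpider.custom_settings (autothrottle, one request per domain) still apply.
process = CrawlerProcess(settings={"FEEDS": {"hasc_output.json": {"format": "json"}}})
process.crawl(HASCSpider)
process.start()  # blocks until the crawl completes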