Merge pull request #235 from dod-advana/patch-hasc-crawler

Patch HASC Crawler

emmarez committed Jun 26, 2024
2 parents df5e863 + 9321fb2 commit 71d7878
Showing 2 changed files with 104 additions and 152 deletions.
4 changes: 4 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -53,6 +53,10 @@ def set_version_hash_field(self, key: str, value: str) -> dict:
"""Sets a new field or updates an old one in the dict used for hashing"""
self.hash_fields[key] = value

def remove_version_hash_field(self, key: str) -> None:
"""Removes a field from tthe dict used for hashing"""
self.hash_fields.pop(key)

def set_display_name(self, name: str) -> None:
"""Update display name for DocItemFields instance"""
self.display_title = name
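For context, a minimal usage sketch of the new remove_version_hash_field helper alongside the existing set_version_hash_field and set_display_name methods. The constructor call mirrors the one shown in the hasc_spider.py diff below; the values are hypothetical and the snippet is not part of this commit.

from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

fields = DocItemFields(
    doc_name="ExampleStatement",
    doc_title="Example Hearing",
    doc_num=" ",
    doc_type="Witness Statement",
    publication_date=datetime(2024, 6, 24),
    cac_login_required=False,
    source_page_url="https://armedservices.house.gov/example-hearing",
    downloadable_items=[
        {
            "doc_type": "pdf",
            "download_url": "https://armedservices.house.gov/example.pdf",
            "compression_type": None,
        }
    ],
    download_url="https://armedservices.house.gov/example.pdf",
    file_ext="pdf",
    display_doc_type="Witness Statement",
)

# Drop doc_num from the hash inputs and pin doc_title so the computed
# version hash lines up with earlier crawler iterations.
fields.remove_version_hash_field("doc_num")
fields.set_version_hash_field("doc_title", "Example Hearing")
fields.set_display_name("HASC Example Hearing - Jane Doe")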
252 changes: 100 additions & 152 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py
@@ -1,190 +1,138 @@
from calendar import day_abbr
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from typing import Any, Generator
from urllib.parse import urljoin
from datetime import datetime
import scrapy
import typing as t
from scrapy.http import Response

from urllib.parse import urljoin, urlparse
from datetime import datetime
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

# only scrape witness statements
# 193 documents

class HASCSpider(GCSpider):
name = "HASC" # Crawler name

"""
As of 06/24/2024
crawls https://armedservices.house.gov/committee-activity/hearings/all for 179 pdfs (doc_type = Witness Statement)
"""

# Crawler name
name = "HASC"
# Level 1: GC app 'Source' filter for docs from this crawler
display_org = "Congress"
# Level 2: GC app 'Source' metadata field for docs from this crawler
data_source = "House Armed Services Committee Publications"
# Level 3 filter
source_title = "House Armed Services Committee"

allowed_domains = ["armedservices.house.gov"]
base_url = "https://armedservices.house.gov"
start_urls = [f"{base_url}/committee-activity/hearings/all?page=0"]

start_urls = [base_url]

randomly_delay_request = True
rotate_user_agent = True

randomly_delay_request = True
custom_settings = {
**GCSpider.custom_settings,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_START_DELAY": 10,
"AUTOTHROTTLE_MAX_DELAY": 60,
"CONCURRENT_REQUESTS_PER_DOMAIN": 1,
}

def parse(self, _):
pages_parser_map = [
(f"{self.base_url}/hearings", self.recursive_parse_hearings),
# (f"{self.base_url}/legislation",) # setup for future crawlers if needed
]

for page_url, parser_func in pages_parser_map:
yield scrapy.Request(page_url, callback=parser_func)

@staticmethod
def get_next_relative_url(response):
return response.css("li.pager-next > a::attr(href)").get()

def recursive_parse_hearings(self, response):

yield from self.parse_hearings_table_page(response)

next_relative_url = self.get_next_relative_url(response)
if next_relative_url:
next_url = f"{self.base_url}{next_relative_url}"
yield scrapy.Request(url=next_url, callback=self.recursive_parse_hearings)

def parse_hearings_table_page(self, response):

rows = response.css(
"div.view-content div")
def extract_doc_name_from_url(url: str) -> str:
"""Returns a doc name given a full URL"""
doc_name = url.split("/")[-1]
return (
doc_name.replace(".pdf", "")
.replace("%", "_")
.replace(".", "")
.replace("-", "")
)
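# Illustrative example (not part of the commit): a statement hosted at a
# hypothetical ".../files/Dr.%20Doe-Statement.pdf" yields the doc name
# "Dr_20DoeStatement" (".pdf" stripped, "%" -> "_", "." and "-" removed).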

def parse(self, response: Response) -> Generator[DocItem, Any, None]:
"""Recursively parses doc items out of House Armed Services Committee site"""
rows = response.css(".evo-views-row")

for row in rows:
try:
link = row.css("h3.field-content a::attr(href)").get()

link = row.css("div.h3.mt-0.font-weight-bold a::attr(href)").get()
if not link:
continue

follow_link = f"{self.base_url}{link}"
yield scrapy.Request(url=follow_link, callback=self.parse_hearing_detail_page)
yield scrapy.Request(url=follow_link, callback=self.parse_hearing_page)
except Exception as e:
print(e)

def extract_doc_name_from_url(self, url):
doc_name = url.split('/')[-1]
doc_name = doc_name.replace('.pdf', '').replace('%', '_').replace('.', '').replace('-', '')
return doc_name
# If data was found in this table then check the next page
if len(rows) > 0:
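# The listing URL carries a page index (start_urls begins at "?page=0");
# the trailing character is read as that index and incremented to request
# the next page, so crawling stops once a page returns no rows.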
current_page_id = int(response.url[-1])
next_url = f"{response.url[0:-1]}{current_page_id+1}"
yield scrapy.Request(url=next_url, callback=self.parse)

def parse_hearing_detail_page(self, response):
def parse_hearing_page(self, response: Response) -> Generator[DocItem, Any, None]:
"""Parses all statements available given a hearing details page"""
try:
# Get the basic details like title and date from the page
title = self.ascii_clean(response.css("#page-title ::text").get())
date_el = response.css("span.date-display-single ::text").get()
date_split = date_el.split()
month, day, year = date_split[1], date_split[2], date_split[3]
date = f"{month} {day} {year}"
doc_title = self.ascii_clean(response.css("h1.display-4 ::text").get())
publication_date = datetime.strptime(
response.css("time ::text").get(), "%a, %m/%d/%Y - %I:%M %p"
)
doc_type = "Witness Statement"

# Extract names of speakers
names = response.css('b ::text').getall()
speaker_names = response.css("b ::text").getall()

# Find all <a> tags within <p> tags and check if they contain the word "statement" and point to a PDF
links = response.css("p a")
for link in links:
# Find all links and check if they contain the word "statement" and point to a PDF
for link in response.css("p a"):
href = link.css("::attr(href)").get()
link_text = link.css("::text").get("").lower() # Get the text and convert it to lower case for comparison

# Check if "statement" is in the link text and the href ends with ".pdf"
if "statement" in link_text and href and href.endswith(".pdf"):
# Check if any of the speaker names is in the link text
for name in names:
if name.lower() in link_text:
follow_link = urljoin(self.base_url, href)
display_title = self.ascii_clean(f"HASC {title} - {name}")
doc_name = self.extract_doc_name_from_url(follow_link)

# Set up the fields with the new PDF URL
fields = {
'doc_name': doc_name,
'doc_num': ' ', # No doc num for this crawler
'doc_title': title,
'doc_type': doc_type,
'cac_login_required': False,
'source_page_url': response.url,
'download_url': follow_link,
'publication_date': date,
'file_ext': 'pdf', # Set to return pdf NOT html
'display_title': display_title
}
# Instantiate DocItem class and assign document's metadata values
doc_item = self.populate_doc_item(fields)
if not href or not href.endswith(".pdf"):
continue

# Get the text and convert it to lower case for comparison
link_text = link.css("::text").get("").lower()
if "statement" not in link_text:
continue

yield doc_item
# Check if any of the speaker names is in the link text
for speaker_name in speaker_names:
if speaker_name.lower() not in link_text:
continue

follow_link = urljoin(self.base_url, href)
display_title = self.ascii_clean(f"HASC {doc_title} - {speaker_name}")
doc_name = self.extract_doc_name_from_url(follow_link)

fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=" ",
doc_type=doc_type,
publication_date=publication_date,
cac_login_required=False,
source_page_url=response.url,
downloadable_items=[
{
"doc_type": "pdf",
"download_url": follow_link,
"compression_type": None,
}
],
download_url=follow_link,
file_ext="pdf",
display_doc_type=doc_type,
)
# Match fields to previous crawler iterations
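# (For reference, the removed populate_doc_item below hashed doc_name,
# doc_title, publication_date, download_url, and display_title, with no
# doc_num.)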
fields.remove_version_hash_field("doc_num")
fields.set_version_hash_field("doc_title", doc_title)
fields.set_display_name(display_title)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)

except Exception as e:
print(e)


def populate_doc_item(self, fields):
# '''
# This functions provides both hardcoded and computed values for the variables
# in the imported DocItem object and returns the populated metadata object
# '''
display_org = "Congress" # Level 1: GC app 'Source' filter for docs from this crawler
data_source = "House Armed Services Committee Publications" # Level 2: GC app 'Source' metadata field for docs from this crawler
source_title = "House Armed Services Committee" # Level 3 filter

doc_name = fields['doc_name']
doc_num = fields['doc_num']
doc_title = fields['doc_title']
doc_type = fields['doc_type']
cac_login_required = fields['cac_login_required']
download_url = fields['download_url']
publication_date = get_pub_date(fields['publication_date'])

display_doc_type = fields['doc_type'] # Doc type for display on app
display_source = data_source + " - " + source_title
display_title = fields['display_title']
is_revoked = False
source_page_url = fields['source_page_url']
source_fqdn = urlparse(source_page_url).netloc

downloadable_items = [{
"doc_type": fields['file_ext'],
"download_url": download_url,
"compression_type": None,
}]
file_ext = fields['file_ext'] # Set to return pdf NOT html

## Assign fields that will be used for versioning
version_hash_fields = {
"doc_name":doc_name,
"doc_title": fields['doc_title'],
"publication_date": publication_date,
"download_url": download_url,
"display_title": display_title
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)

return DocItem(
doc_name = doc_name,
doc_title = doc_title,
doc_num = doc_num,
doc_type = doc_type,
display_doc_type = display_doc_type, #
publication_date = publication_date,
cac_login_required = cac_login_required,
crawler_used = self.name,
downloadable_items = downloadable_items,
source_page_url = source_page_url, #
source_fqdn = source_fqdn, #
download_url = download_url, #
version_hash_raw_data = version_hash_fields, #
version_hash = version_hash,
display_org = display_org, #
data_source = data_source, #
source_title = source_title, #
display_source = display_source, #
display_title = display_title, #
file_ext = file_ext, # Set to return pdf NOT html
is_revoked = is_revoked, #
)
