Patch HASC Crawler #235

Merged 5 commits on Jun 26, 2024
Changes from all commits
4 changes: 4 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -53,6 +53,10 @@ def set_version_hash_field(self, key: str, value: str) -> dict:
"""Sets a new field or updates an old one in the dict used for hashing"""
self.hash_fields[key] = value

def remove_version_hash_field(self, key: str) -> None:
"""Removes a field from tthe dict used for hashing"""
self.hash_fields.pop(key)

def set_display_name(self, name: str) -> None:
"""Update display name for DocItemFields instance"""
self.display_title = name
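The new `remove_version_hash_field` is the inverse of the existing `set_version_hash_field`: together they control exactly which fields feed the version hash. A minimal usage sketch, assuming `hash_fields` is a plain dict keyed by field name (the `DemoFields` class below is a stand-in for illustration, not the real `DocItemFields`):

```python
class DemoFields:
    """Illustrative stand-in that only models the hash_fields dict."""

    def __init__(self) -> None:
        self.hash_fields = {"doc_name": "example_doc", "doc_num": " "}

    def set_version_hash_field(self, key: str, value: str) -> None:
        """Sets a new field or updates an old one in the dict used for hashing"""
        self.hash_fields[key] = value

    def remove_version_hash_field(self, key: str) -> None:
        """Removes a field from the dict used for hashing"""
        self.hash_fields.pop(key)  # KeyError if the field was never set


fields = DemoFields()
fields.remove_version_hash_field("doc_num")            # drop a field from hashing
fields.set_version_hash_field("doc_title", "A Title")  # add or overwrite a field
print(fields.hash_fields)  # {'doc_name': 'example_doc', 'doc_title': 'A Title'}
```

The HASC spider below uses this pair to drop `doc_num` from the hash and re-add `doc_title`, keeping version hashes aligned with earlier crawler iterations.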
252 changes: 100 additions & 152 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py
@@ -1,190 +1,138 @@
from calendar import day_abbr
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from typing import Any, Generator
from urllib.parse import urljoin
from datetime import datetime
import scrapy
import typing as t
from scrapy.http import Response

from urllib.parse import urljoin, urlparse
from datetime import datetime
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

# only scrape witness statements
# 193 documents

class HASCSpider(GCSpider):
name = "HASC" # Crawler name

"""
As of 06/24/2024
crawls https://armedservices.house.gov/committee-activity/hearings/all for 179 pdfs (doc_type = Witness Statement)
"""

# Crawler name
name = "HASC"
# Level 1: GC app 'Source' filter for docs from this crawler
display_org = "Congress"
# Level 2: GC app 'Source' metadata field for docs from this crawler
data_source = "House Armed Services Committee Publications"
# Level 3 filter
source_title = "House Armed Services Committee"

allowed_domains = ["armedservices.house.gov"]
base_url = "https://armedservices.house.gov"
start_urls = [f"{base_url}/committee-activity/hearings/all?page=0"]

start_urls = [base_url]

randomly_delay_request = True
rotate_user_agent = True

randomly_delay_request = True
custom_settings = {
**GCSpider.custom_settings,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_START_DELAY": 10,
"AUTOTHROTTLE_MAX_DELAY": 60,
"CONCURRENT_REQUESTS_PER_DOMAIN": 1,
}

def parse(self, _):
pages_parser_map = [
(f"{self.base_url}/hearings", self.recursive_parse_hearings),
# (f"{self.base_url}/legislation",) # setup for future crawlers if needed
]

for page_url, parser_func in pages_parser_map:
yield scrapy.Request(page_url, callback=parser_func)

@staticmethod
def get_next_relative_url(response):
return response.css("li.pager-next > a::attr(href)").get()

def recursive_parse_hearings(self, response):

yield from self.parse_hearings_table_page(response)

next_relative_url = self.get_next_relative_url(response)
if next_relative_url:
next_url = f"{self.base_url}{next_relative_url}"
yield scrapy.Request(url=next_url, callback=self.recursive_parse_hearings)

def parse_hearings_table_page(self, response):

rows = response.css(
"div.view-content div")
def extract_doc_name_from_url(url: str) -> str:
"""Returns a doc name given a full URL"""
doc_name = url.split("/")[-1]
return (
doc_name.replace(".pdf", "")
.replace("%", "_")
.replace(".", "")
.replace("-", "")
)

def parse(self, response: Response) -> Generator[DocItem, Any, None]:
"""Recursively parses doc items out of House Armed Services Committee site"""
rows = response.css(".evo-views-row")

for row in rows:
try:
link = row.css("h3.field-content a::attr(href)").get()

link = row.css("div.h3.mt-0.font-weight-bold a::attr(href)").get()
if not link:
continue

follow_link = f"{self.base_url}{link}"
yield scrapy.Request(url=follow_link, callback=self.parse_hearing_detail_page)
yield scrapy.Request(url=follow_link, callback=self.parse_hearing_page)
except Exception as e:
print(e)

def extract_doc_name_from_url(self, url):
doc_name = url.split('/')[-1]
doc_name = doc_name.replace('.pdf', '').replace('%', '_').replace('.', '').replace('-', '')
return doc_name
# If data was found in this table then check the next page
if len(rows) > 0:
current_page_id = int(response.url[-1])
next_url = f"{response.url[0:-1]}{current_page_id+1}"
yield scrapy.Request(url=next_url, callback=self.parse)

def parse_hearing_detail_page(self, response):
def parse_hearing_page(self, response: Response) -> Generator[DocItem, Any, None]:
"""Parses all statements available given a hearing details page"""
try:
# Get the basic details like title and date from the page
title = self.ascii_clean(response.css("#page-title ::text").get())
date_el = response.css("span.date-display-single ::text").get()
date_split = date_el.split()
month, day, year = date_split[1], date_split[2], date_split[3]
date = f"{month} {day} {year}"
doc_title = self.ascii_clean(response.css("h1.display-4 ::text").get())
publication_date = datetime.strptime(
response.css("time ::text").get(), "%a, %m/%d/%Y - %I:%M %p"
)
doc_type = "Witness Statement"

# Extract names of speakers
names = response.css('b ::text').getall()
speaker_names = response.css("b ::text").getall()

# Find all <a> tags within <p> tags and check if they contain the word "statement" and point to a PDF
links = response.css("p a")
for link in links:
# Find all links and check if they contain the word "statement" and point to a PDF
for link in response.css("p a"):
href = link.css("::attr(href)").get()
link_text = link.css("::text").get("").lower() # Get the text and convert it to lower case for comparison

# Check if "statement" is in the link text and the href ends with ".pdf"
if "statement" in link_text and href and href.endswith(".pdf"):
# Check if any of the speaker names is in the link text
for name in names:
if name.lower() in link_text:
follow_link = urljoin(self.base_url, href)
display_title = self.ascii_clean(f"HASC {title} - {name}")
doc_name = self.extract_doc_name_from_url(follow_link)

# Set up the fields with the new PDF URL
fields = {
'doc_name': doc_name,
'doc_num': ' ', # No doc num for this crawler
'doc_title': title,
'doc_type': doc_type,
'cac_login_required': False,
'source_page_url': response.url,
'download_url': follow_link,
'publication_date': date,
'file_ext': 'pdf', # Set to return pdf NOT html
'display_title': display_title
}
# Instantiate DocItem class and assign document's metadata values
doc_item = self.populate_doc_item(fields)
if not href or not href.endswith(".pdf"):
continue

# Get the text and convert it to lower case for comparison
link_text = link.css("::text").get("").lower()
if "statement" not in link_text:
continue

yield doc_item
# Check if any of the speaker names is in the link text
for speaker_name in speaker_names:
if speaker_name.lower() not in link_text:
continue

follow_link = urljoin(self.base_url, href)
display_title = self.ascii_clean(f"HASC {doc_title} - {speaker_name}")
doc_name = self.extract_doc_name_from_url(follow_link)

fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=" ",
doc_type=doc_type,
publication_date=publication_date,
cac_login_required=False,
source_page_url=response.url,
downloadable_items=[
{
"doc_type": "pdf",
"download_url": follow_link,
"compression_type": None,
}
],
download_url=follow_link,
file_ext="pdf",
display_doc_type=doc_type,
)
# Match fields to previous crawler iterations
fields.remove_version_hash_field("doc_num")
fields.set_version_hash_field("doc_title", doc_title)
fields.set_display_name(display_title)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)

except Exception as e:
print(e)


def populate_doc_item(self, fields):
# '''
# This function provides both hardcoded and computed values for the variables
# in the imported DocItem object and returns the populated metadata object
# '''
display_org = "Congress" # Level 1: GC app 'Source' filter for docs from this crawler
data_source = "House Armed Services Committee Publications" # Level 2: GC app 'Source' metadata field for docs from this crawler
source_title = "House Armed Services Committee" # Level 3 filter

doc_name = fields['doc_name']
doc_num = fields['doc_num']
doc_title = fields['doc_title']
doc_type = fields['doc_type']
cac_login_required = fields['cac_login_required']
download_url = fields['download_url']
publication_date = get_pub_date(fields['publication_date'])

display_doc_type = fields['doc_type'] # Doc type for display on app
display_source = data_source + " - " + source_title
display_title = fields['display_title']
is_revoked = False
source_page_url = fields['source_page_url']
source_fqdn = urlparse(source_page_url).netloc

downloadable_items = [{
"doc_type": fields['file_ext'],
"download_url": download_url,
"compression_type": None,
}]
file_ext = fields['file_ext'] # Set to return pdf NOT html

## Assign fields that will be used for versioning
version_hash_fields = {
"doc_name":doc_name,
"doc_title": fields['doc_title'],
"publication_date": publication_date,
"download_url": download_url,
"display_title": display_title
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)

return DocItem(
doc_name = doc_name,
doc_title = doc_title,
doc_num = doc_num,
doc_type = doc_type,
display_doc_type = display_doc_type, #
publication_date = publication_date,
cac_login_required = cac_login_required,
crawler_used = self.name,
downloadable_items = downloadable_items,
source_page_url = source_page_url, #
source_fqdn = source_fqdn, #
download_url = download_url, #
version_hash_raw_data = version_hash_fields, #
version_hash = version_hash,
display_org = display_org, #
data_source = data_source, #
source_title = source_title, #
display_source = display_source, #
display_title = display_title, #
file_ext = file_ext, # Set to return pdf NOT html
is_revoked = is_revoked, #
)
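For reference, a quick worked example of the new `extract_doc_name_from_url` helper and the page-increment logic in `parse`. The URL below is hypothetical, chosen only to show the string transformations:

```python
def extract_doc_name_from_url(url: str) -> str:
    """Returns a doc name given a full URL (same logic as the spider above)."""
    doc_name = url.split("/")[-1]
    return (
        doc_name.replace(".pdf", "")
        .replace("%", "_")
        .replace(".", "")
        .replace("-", "")
    )


# Hypothetical PDF URL, for illustration only.
url = "https://armedservices.house.gov/files/Witness-Statement-2024.06.pdf"
print(extract_doc_name_from_url(url))  # WitnessStatement202406

# Pagination: the spider starts at ...?page=0 and, while a page still yields
# rows, requests the next page by incrementing the digit read from the end of
# the current response URL.
current_url = "https://armedservices.house.gov/committee-activity/hearings/all?page=0"
current_page_id = int(current_url[-1])                 # reads the final character
next_url = f"{current_url[0:-1]}{current_page_id + 1}"
print(next_url)  # ...?page=1
```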