Skip to content

Commit

Permalink
simplify and format
Browse files: browse the repository at this point in the history
  • Loading branch information
Matthew Kersting committed Jun 3, 2024
1 parent 4f4d8a4 commit 6254710
Showing 1 changed file with 6 additions and 12 deletions.
18 changes: 6 additions & 12 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/ufc_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ class UFCSpider(GCSpider):
allowed_domains = [domain]
start_urls = [urljoin(base_url, "/ffc/dod/unified-facilities-criteria-ufc")]

rotate_user_agent = True
date_format = "%m/%d/%y"

def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
"""Parses UFC doc items out of Whole Building Design Guide site"""
page_url = response.url
Expand All @@ -57,7 +54,6 @@ def parse_table(

try:
full_title = cells[0].get_text().strip()
acronym = full_title.split(" ")[0]
doc_num = full_title.split(" ")[1]
doc_title = " ".join(full_title.split(" ")[2:])
publication_date = parse_timestamp(cells[1].get_text().strip())
Expand All @@ -70,12 +66,16 @@ def parse_table(
doc_name=full_title,
doc_title=self.ascii_clean(doc_title),
doc_num=doc_num,
doc_type=self.get_doc_type(acronym),
doc_type="Document",
publication_date=publication_date,
cac_login_required=False,
source_page_url=url,
downloadable_items=[
{"doc_type": "pdf", "download_url": url, "compression_type": None}
{
"doc_type": "pdf",
"download_url": url,
"compression_type": None,
}
],
download_url=url,
file_ext="pdf",
Expand All @@ -95,9 +95,3 @@ def parse_table(
callback=self.parse_table,
meta={"page_id": page_id},
)

def get_doc_type(self, doc_name: str) -> str:
    """Return the document type for the given document name or acronym.

    Args:
        doc_name: Text to classify. Any value containing the substring
            "FC" (for example "UFC") is accepted.

    Returns:
        The string "Document".

    Raises:
        ValueError: If ``doc_name`` does not contain "FC".
    """
    # Substring check, not equality: "UFC" and "FC" both qualify.
    if "FC" in doc_name:
        return "Document"
    raise ValueError(f"Unexpected value for doc_name {doc_name}")

0 comments on commit 6254710

Please sign in to comment.