Merge pull request #178 from dod-advana/varunt/crawler_fixes
SASC and CFR changes
takao8 committed Jun 21, 2023
2 parents 99374a0 + 3ac9583 commit acb50da
Showing 2 changed files with 6 additions and 17 deletions.
dataPipelines/gc_scrapy/gc_scrapy/spiders/cfr_spider.py (11 changes: 0 additions, 11 deletions)
@@ -31,9 +31,6 @@ class CFRSpider(GCSpider):
         "x-requested-with": "XMLHttpRequest"
     }
 
-    def start_requests(self):
-        yield scrapy.Request(url=self.start_urls[0], method='GET', headers=self.headers)
-
     @staticmethod
     def get_pub_date(publication_date):
         '''
@@ -91,14 +88,6 @@ def get_package_ids(self, response):
             yield response.follow(url=detail_url, callback=self.parse_detail_data, meta={"offset": 0, "year": year},
                                   headers=self.headers)
 
-            # iterate offset
-            next_offset = current_offset + 1
-            next_offset_url = response.url.replace(
-                f'offset={current_offset}', f'offset={next_offset}')
-
-            yield response.follow(url=next_offset_url, callback=self.get_package_ids, meta={"offset": next_offset, "year": year},
-                                  headers=self.headers)
-
     def parse_detail_data(self, response):
         data = json.loads(response.body)
         year = response.meta["year"]
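Context for the CFR change: with the custom start_requests gone, scrapy's stock Spider.start_requests issues the initial GET for each entry in start_urls, and dropping the offset-increment block removes the spider's manual pagination loop. A minimal sketch of the fallback behavior, assuming the spider inherits scrapy's defaults (CFRSpiderSketch, the placeholder URL, and the parse callback below are illustrative, not the repo's code):

import scrapy


class CFRSpiderSketch(scrapy.Spider):
    name = "cfr_sketch"
    # Placeholder URL; the real spider's start_urls are not shown in this diff.
    start_urls = ["https://www.govinfo.gov/wssearch/rb/cfr"]

    # Without an overridden start_requests, headers that used to be attached
    # per-request can instead be applied to every request via settings.
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "x-requested-with": "XMLHttpRequest",
        }
    }

    def parse(self, response):
        # Default callback for requests produced by the built-in
        # start_requests; the real spider routes responses to get_package_ids.
        self.logger.info("fetched %s", response.url)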
dataPipelines/gc_scrapy/gc_scrapy/spiders/sasc_spider.py (12 changes: 6 additions, 6 deletions)
@@ -74,7 +74,7 @@ def parse_hearing_detail_page(self, response):
         # Get the hearing detail page as a document
 
         main = response.css("div.SiteLayout__main")
-        title_raw = self.ascii_clean(main.css("h1.Heading__title ::text").get().strip())
+        title_raw = self.ascii_clean(main.css("h1.Heading__title::text")[-1].get().strip())
         title = ' '.join(title_raw.split())
         date = main.css('div.Hearing__detail time::attr(datetime)').get()
         spaced_title = f" - {title}" if title else ""
@@ -149,9 +149,9 @@ def parse_hearing_detail_page(self, response):
         for witdoc in witness_docs:
             witness_href = witdoc.css('a::attr(href)').get()
             if witness_href is not None:
-                honorific = witblock.css('h4.Heading__title span:nth-child(1)::text').get()
-                wit_name = witblock.css('h4.Heading__title span:nth-child(2)::text').get()
-                member_name = witblock.css('h4.Heading__title ::text').get()
+                honorific = witblock.css('h3.Heading__title span:nth-child(1)::text').get()
+                wit_name = witblock.css('h3.Heading__title span:nth-child(2)::text').get()
+                member_name = witblock.css('h3.Heading__title ::text').get()
 
                 if honorific and wit_name is not None:
                     full_name_raw = f"{honorific} {wit_name}"
@@ -217,7 +217,7 @@ def populate_doc_item(self, fields):
 
         display_doc_type = fields['display_doc_type']  # Doc type for display on app
         display_source = data_source + " - " + source_title
-        display_title = doc_type + " - " + doc_title  # Different than other crawlers due to lack of doc_num; added a dash for clarity
+        display_title = doc_type + ": " + doc_title  # Different than other crawlers due to lack of doc_num; a colon separates type and title for clarity
         is_revoked = False
         source_page_url = fields['source_page_url']
         source_fqdn = urlparse(source_page_url).netloc
@@ -233,7 +233,7 @@ def populate_doc_item(self, fields):
             "download_url": download_url,
             "display_title": display_title
         }
-        
+
         version_hash = dict_to_sha256_hex_digest(version_hash_fields)
 
         return DocItem(
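The title-selector fix is easier to see in isolation. A minimal parsel sketch (the HTML snippet is invented to mirror the heading structure the fix implies, with a decorative span inside the title):

from parsel import Selector

html = """
<div class="SiteLayout__main">
  <h1 class="Heading__title"><span>Hearings</span>
    Department of Defense Budget Posture</h1>
</div>
"""
sel = Selector(text=html)

# Old selector: "h1.Heading__title ::text" (note the space) matches every
# text node in the subtree, so .get() returns the first one -- the span's
# "Hearings" -- rather than the title.
old = sel.css("h1.Heading__title ::text").get().strip()

# New selector: "h1.Heading__title::text" matches only the h1's own text
# nodes, and [-1] picks the last of them, the actual title text.
new = sel.css("h1.Heading__title::text")[-1].get().strip()

print(old)  # Hearings
print(new)  # Department of Defense Budget Posture

The h4-to-h3 edits in the witness block presumably track a markup change on the committee site; the nth-child span pattern itself is unchanged.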

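One line of unchanged context is worth a note: version_hash = dict_to_sha256_hex_digest(version_hash_fields) is what makes the display_title change ripple into versioning, since display_title is one of the hashed fields. A plausible sketch of what a helper like dict_to_sha256_hex_digest does (the repo's actual implementation is not shown in this diff; sorted-key JSON canonicalization is an assumption):

import hashlib
import json


def dict_to_sha256_hex_digest_sketch(d: dict) -> str:
    # Assumed behavior: serialize the dict deterministically so equal dicts
    # always hash the same, then return the SHA-256 hex digest of the bytes.
    canonical = json.dumps(d, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()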