hasc spider patch
Varun Talwar committed May 17, 2024
1 parent f44c46f commit 217f47a
Showing 1 changed file with 15 additions and 2 deletions.
17 changes: 15 additions & 2 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py
@@ -18,6 +18,14 @@ class HASCSpider(GCSpider):
 
     randomly_delay_request = True
     rotate_user_agent = True
+    custom_settings = {
+        **GCSpider.custom_settings,
+        "DOWNLOAD_DELAY": 1,  # Wait at least 1 second between requests
+        "AUTOTHROTTLE_ENABLED": True,
+        "AUTOTHROTTLE_START_DELAY": 1,
+        "AUTOTHROTTLE_MAX_DELAY": 10,
+        "CONCURRENT_REQUESTS_PER_DOMAIN": 2,
+    }
 
     def parse(self, _):
         pages_parser_map = [
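
Note: these are standard Scrapy throttling settings: a fixed DOWNLOAD_DELAY, the built-in AutoThrottle extension (the AUTOTHROTTLE_* keys), and a per-domain concurrency cap. The **GCSpider.custom_settings unpacking keeps the parent spider's settings and lets the keys listed after it override or extend them. A minimal sketch of that merge behavior (the base keys below are made-up stand-ins, not the real GCSpider.custom_settings):

base = {"DOWNLOAD_DELAY": 0, "ROBOTSTXT_OBEY": False}  # stand-in for GCSpider.custom_settings
merged = {
    **base,
    "DOWNLOAD_DELAY": 1,           # later key overrides the unpacked base value
    "AUTOTHROTTLE_ENABLED": True,  # new key is simply added
}
print(merged)
# {'DOWNLOAD_DELAY': 1, 'ROBOTSTXT_OBEY': False, 'AUTOTHROTTLE_ENABLED': True}
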
@@ -58,6 +66,11 @@ def parse_hearings_table_page(self, response):
         except Exception as e:
             print(e)
 
+    def extract_doc_name_from_url(self, url):
+        doc_name = url.split('/')[-1].split('.')[0]
+        doc_name = doc_name.replace('%', '_').replace('-', '')
+        return doc_name
+
    def parse_hearing_detail_page(self, response):
        try:
            # Get the basic details like title and date from the page
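
For illustration, the new extract_doc_name_from_url helper turns a PDF link's filename into a compact doc_name: it takes the last path segment, drops the extension, strips '-', and maps '%' to '_'. A worked example with a hypothetical URL (made up for illustration, not a real HASC link):

url = "https://docs.house.gov/meetings/AS/AS00/20240517/117283/HHRG-118-AS00-Transcript-20240517.pdf"
doc_name = url.split('/')[-1].split('.')[0]             # "HHRG-118-AS00-Transcript-20240517"
doc_name = doc_name.replace('%', '_').replace('-', '')  # "HHRG118AS00Transcript20240517"

Deriving doc_name from the URL rather than the display title (see the next hunk) keeps the name stable even if the page's link text changes. Note the helper only replaces the '%' character itself; if filenames can contain full percent-encodings such as %20, urllib.parse.unquote would decode them first, but the patch does not do that.
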
@@ -84,7 +97,7 @@ def parse_hearing_detail_page(self, response):
                 if name.lower() in link_text:
                     follow_link = urljoin(self.base_url, href)
                     display_title = self.ascii_clean(f"HASC {title} - {name}")
-                    doc_name = display_title
+                    doc_name = self.extract_doc_name_from_url(follow_link)
 
                     # Set up the fields with the new PDF URL
                     fields = {
@@ -142,7 +155,7 @@ def populate_doc_item(self, fields):
         ## Assign fields that will be used for versioning
         version_hash_fields = {
             "doc_name": doc_name,
-            # "doc_num": doc_num,
+            "doc_title": fields['doc_title'],
             "publication_date": publication_date,
             "download_url": download_url,
             "display_title": display_title
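
The versioning dict now includes doc_title and drops the commented-out doc_num line. Fields like these are typically folded into a deterministic hash so a document is only re-ingested when one of them changes. A minimal sketch of that idea, assuming a simple sha256 over the sorted field values (the pipeline's actual hashing scheme may differ, and the values below are hypothetical):

import hashlib

version_hash_fields = {
    "doc_name": "HHRG118AS00Transcript20240517",  # hypothetical values
    "doc_title": "Hearing Transcript",
    "publication_date": "2024-05-17",
    "download_url": "https://docs.house.gov/example.pdf",
    "display_title": "HASC Hearing Transcript",
}
version_hash = hashlib.sha256(
    " ".join(str(version_hash_fields[k]) for k in sorted(version_hash_fields)).encode("utf-8")
).hexdigest()
print(version_hash[:12])  # short fingerprint used for change detection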