Skip to content

Commit

Permalink
Merge pull request #201 from dod-advana/anthony/jcs_slowdown
Browse files — browse the repository at this point in the history
Added a delay to the crawler to prevent blacklisting; added a one-off placeholder
  • Loading branch information
amaruca141 committed Oct 17, 2023
2 parents a7affa8 + 8888a92 commit ecaba71
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/jcs_pubs_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from datetime import datetime
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date

import time
doc_type_num_re = re.compile(r'(.*)\s(\d+.*)')


Expand Down Expand Up @@ -39,6 +40,7 @@ def parse(self, response):
]

for link in doc_links:
time.sleep(20) # Slow crawler down to prevent blacklist
yield response.follow(url=link, callback=self.parse_doc_table_page)

def parse_doc_table_page(self, response):
Expand Down Expand Up @@ -93,6 +95,7 @@ def parse_doc_table_page(self, response):
a for a in nav_table.css('a.CommandButton')
if a.css('::text').get() == 'Next'
)
time.sleep(20) # Slow crawler down to prevent blacklist
yield response.follow(url=next_page_link,
callback=self.parse_doc_table_page)
except:
Expand Down

0 comments on commit ecaba71

Please sign in to comment.