Skip to content

Commit

Permalink
Merge pull request #199 from dod-advana/anthony/slowdown_crawls
Browse files Browse the repository at this point in the history
implemented new crawler settings and added oneoff placeholder
  • Loading branch information
amaruca141 committed Oct 12, 2023
2 parents aec1397 + 147f9fe commit a7affa8
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions dataPipelines/gc_scrapy/gc_scrapy/runspider_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,14 @@
"ROBOTSTXT_OBEY": False,
"LOG_LEVEL": "INFO",
"DOWNLOAD_FAIL_ON_DATALOSS": False,

# Slow down crawler
"DOWNLOAD_DELAY": 0.1, # Delay between requests in seconds
"DOWNLOAD_TIMEOUT": 3.5, # Time till skip
"RETRY_ENABLE": True,
"RETRY_TIMES": 2,
"CONCURRENT_REQUESTS": 10,
}

selenium_settings = {
"SELENIUM_DRIVER_NAME": "chrome",
"SELENIUM_DRIVER_EXECUTABLE_PATH": "/usr/local/bin/chromedriver",
Expand All @@ -32,7 +38,6 @@
],
"DOWNLOADER_MIDDLEWARES": {
**general_settings["DOWNLOADER_MIDDLEWARES"],
# make sure the values are not clashing
"dataPipelines.gc_scrapy.gc_scrapy.downloader_middlewares.SeleniumMiddleware": max(
general_settings["DOWNLOADER_MIDDLEWARES"].values()
)
Expand Down
Empty file.

0 comments on commit a7affa8

Please sign in to comment.