Commit
remove unused import, clean up implementation
Matthew Kersting committed May 23, 2024
1 parent 05163db commit d0aea9a
Showing 1 changed file with 6 additions and 15 deletions.
dataPipelines/gc_scrapy/gc_scrapy/spiders/navy_med_spider.py: 21 changes (6 additions, 15 deletions)
@@ -1,13 +1,8 @@
 from typing import Any, Generator
-import time
-from urllib.parse import urljoin, urlparse
-from datetime import datetime
-from scrapy import Selector
-from scrapy.http import Response
 import bs4
+from scrapy.http import Response
-
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver import Chrome
 from selenium.webdriver.common.action_chains import ActionChains
 from selenium.common.exceptions import NoSuchElementException
@@ -20,7 +15,7 @@

 class NavyMedSpider(GCSeleniumSpider):
     """
-    As of 05/22/2024
+    As of 05/23/2024
     crawls https://www.med.navy.mil/Directives for 398 pdfs total: 318 pdfs (doc_type = BUMEDINST),
     22 pdfs (doc_type = BUMEDNOTE) and 58 pdfs (doc_type = NAVMED)
     """
@@ -126,7 +121,7 @@ def parse_tab(

         except Exception as exc:
             raise NoSuchElementException(
-                f"Failed to find table to scrape from using css selector: {self.tabs_ul_selector}"
+                f"Failed to find table to scrape from using selector: {self.tabs_ul_selector}"
             ) from exc
         try:
             if has_next_page:
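
The wording change above only drops "css" from the message, which reads as a generalization of the selector type. For context, a minimal sketch of the surrounding pattern: convert a generic wait failure into a NoSuchElementException that names the failing selector, chained with "from" so the original traceback survives. The selector value, helper name, and WebDriverWait usage are illustrative assumptions, not code from this commit:

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Hypothetical stand-in for the spider's self.tabs_ul_selector.
TABS_UL_SELECTOR = "ul#tabs"

def wait_for_tabs(driver, timeout: int = 10):
    """Wait for the tab list, re-raising lookup failures with the
    failing selector named in the message."""
    try:
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, TABS_UL_SELECTOR))
        )
    except Exception as exc:
        # Chaining with "from exc" preserves the original cause.
        raise NoSuchElementException(
            f"Failed to find table to scrape from using selector: {TABS_UL_SELECTOR}"
        ) from exc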
@@ -136,7 +131,7 @@ def parse_tab(
             print("Could not go to next page: " + str(e))
 
     def parse_table(
-        self, driver: Chrome, doc_type, index
+        self, driver: Chrome, doc_type: str, index: int
     ) -> Generator[DocItem, Any, None]:
         """Parse table for all documents"""
         soup = bs4.BeautifulSoup(driver.page_source, features="html.parser")
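
The signature above now carries explicit type hints for doc_type and index. As a minimal sketch of the pattern the method is built on, snapshot the Selenium-rendered DOM once and parse it statically with BeautifulSoup, assuming index selects which rendered table to read; the real method yields DocItem objects and does more bookkeeping:

from typing import Any, Generator
import bs4
from selenium.webdriver import Chrome

def parse_table(driver: Chrome, doc_type: str, index: int) -> Generator[dict, Any, None]:
    # One page_source snapshot replaces repeated WebDriver element lookups.
    soup = bs4.BeautifulSoup(driver.page_source, features="html.parser")
    table = soup.find_all("table")[index]  # assumption: index picks the tab's table
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if cells:  # header rows use th, not td, and are skipped
            yield {
                "doc_type": doc_type,
                "cells": [cell.get_text(strip=True) for cell in cells],
            }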
@@ -148,13 +143,9 @@ def parse_table(
         bumednote_seen = set({})
         dup_change_seen = False
         if doc_type == "NAVMED":
-            title_id = 1
-            publication_id = 0
-            doc_num_id = 2
+            title_id, publication_id, doc_num_id = 1, 0, 2
         else:
-            title_id = 2
-            publication_id = 3
-            doc_num_id = 1
+            title_id, publication_id, doc_num_id = 2, 3, 1
 
         for row in rows:
             cells = row.find_all("td")
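
The final hunk collapses three assignments per branch into one tuple unpacking: identical behavior, less vertical noise. A self-contained sketch of how those indices then drive cell extraction; the helper, sample row, and printed values are invented for illustration:

import bs4

def column_indices(doc_type: str) -> tuple[int, int, int]:
    # Column order differs between the NAVMED tab and the other tabs.
    if doc_type == "NAVMED":
        title_id, publication_id, doc_num_id = 1, 0, 2
    else:
        title_id, publication_id, doc_num_id = 2, 3, 1
    return title_id, publication_id, doc_num_id

# Hypothetical table row, for illustration only.
row_html = "<tr><td>01 Jan 2024</td><td>Manual of the Medical Department</td><td>P-117</td></tr>"
title_id, publication_id, doc_num_id = column_indices("NAVMED")
cells = bs4.BeautifulSoup(row_html, "html.parser").find_all("td")
print(cells[title_id].get_text(strip=True))    # -> Manual of the Medical Department
print(cells[doc_num_id].get_text(strip=True))  # -> P-117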