Patch navy med spider #232

Merged
merged 5 commits, Jun 12, 2024
291 changes: 145 additions & 146 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/navy_med_spider.py
@@ -1,59 +1,93 @@
from scrapy import Selector
from typing import Any, Generator
import time
import bs4
from scrapy.http import Response

from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSeleniumSpider import GCSeleniumSpider
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import Chrome
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.remote.webelement import WebElement
import time

from urllib.parse import urljoin, urlparse
from datetime import datetime
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.GCSeleniumSpider import GCSeleniumSpider
from dataPipelines.gc_scrapy.gc_scrapy.utils import parse_timestamp


class NavyMedSpider(GCSeleniumSpider):
name = "navy_med_pubs" # Crawler name

start_urls = [
"https://www.med.navy.mil/Directives/"
]
tabs_ul_selector = 'ul.z-tabs-nav.z-tabs-desktop'
"""
As of 05/23/2024
crawls https://www.med.navy.mil/Directives for 402 pdfs total: 321 pdfs (doc_type = BUMEDINST),
24 pdfs (doc_type = BUMEDNOTE), and 57 pdfs (doc_type = NAVMED)
Note: BUMEDINST has 322 docs in the table but one requires a CAC
"""

# Crawler name
name = "navy_med_pubs"
# Level 1: GC app 'Source' filter for docs from this crawler
display_org = "US Navy Medicine"
# Level 2: GC app 'Source' metadata field for docs from this crawler
data_source = "Navy Medicine"
# Level 3 filter
source_title = "Unlisted Source"

start_urls = ["https://www.med.navy.mil/Directives/"]
tabs_ul_selector = "ul.z-tabs-nav.z-tabs-desktop"

# selenium_request_overrides = {
# "wait_until": EC.element_to_be_clickable(
# (By.CSS_SELECTOR, tabs_ul_selector))
# }

tabs_parsed = set({})
tabs_doc_type_dict = {
"BUMED Instructions": "BUMEDINST",
"BUMED Notices (Notes)": "BUMEDNOTE",
"All Pubs and Manuals": "NAVMED"
"All Pubs and Manuals": "NAVMED",
}

rotate_user_agent = True
randomly_delay_request = True
custom_settings = {
**GCSeleniumSpider.custom_settings,
"DOWNLOAD_TIMEOUT": 7.0,
}

def get_tab_button_els(self, driver: Chrome):
def get_tab_button_els(self, driver: Chrome) -> list:
"""Return list of all tab elements"""
tab_button_els = driver.find_elements_by_css_selector(
f"{self.tabs_ul_selector} li a")
f"{self.tabs_ul_selector} li a"
)

tabs_to_click = [
el for el in tab_button_els if el.get_attribute("textContent") in self.tabs_doc_type_dict.keys()
el
for el in tab_button_els
if el.get_attribute("textContent") in self.tabs_doc_type_dict
]

return tabs_to_click
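
# Editorial aside, not part of this PR: find_elements_by_css_selector is the legacy
# Selenium API that newer Selenium 4 releases no longer provide. If the pinned
# Selenium version is ever bumped, a minimal sketch of the equivalent lookup using
# the already-imported By would be:
#
#     tab_button_els = driver.find_elements(
#         By.CSS_SELECTOR, f"{self.tabs_ul_selector} li a"
#     )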

def parse(self, response):
def get_next_page_anchor(self, driver: Chrome) -> WebElement:
"""Find and return the "Next" pagination anchor"""
els = driver.find_elements_by_css_selector(
"table.PagingTable tr td:nth-child(2) a"
)

try:
next_button_el = next(iter([el for el in els if el.text == "Next"]))

return next_button_el
except Exception as exc:
raise NoSuchElementException from exc
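
# Note: raising NoSuchElementException from exc re-raises the exception class itself
# (Python instantiates it with no message) and chains the original lookup failure as
# its __cause__. parse_tab below relies on catching exactly this type as the normal
# "last page reached" signal, roughly:
#
#     try:
#         next_page_el = self.get_next_page_anchor(driver)
#     except NoSuchElementException:
#         has_next_page = False  # last page: parse it, then stop clicking "Next"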

def parse(self, response: Response) -> Generator[DocItem, Any, None]:
"""Parse doc items out of navy med pubs site"""
driver: Chrome = response.meta["driver"]

for i, doc_type in enumerate(self.tabs_doc_type_dict.values()):
# must re-grab button ref if page has changed (table paged etc)
driver.get(self.start_urls[0]) # navigating to the homepage again to reset the page (because refresh doesn't work)
time.sleep(5) # waiting to be sure that it loaded
driver.get(
self.start_urls[0]
) # navigating to the homepage again to reset the page (because refresh doesn't work)
time.sleep(8) # waiting to be sure that it loaded
try:
button = self.get_tab_button_els(driver)[i]
except Exception as e:
print(doc_type)
print(self.tabs_ul_selector)
print("Error when getting tab button: " + e)
try:
ActionChains(driver).move_to_element(button).click(button).perform()
@@ -67,82 +101,96 @@ def parse(self, response):
except Exception as e:
print("error when getting items: " + e)

def get_next_page_anchor(self, driver):
els = driver.find_elements_by_css_selector(
'table.PagingTable tr td:nth-child(2) a')

try:
next_button_el = next(
iter([el for el in els if el.text == 'Next']))

return next_button_el
except Exception as e:
raise NoSuchElementException

def parse_tab(self, driver: Chrome, doc_type, index):
def parse_tab(
self, driver: Chrome, doc_type: str, index: int
) -> Generator[DocItem, Any, None]:
"""After selecting a tab iterate through each page and parse the table"""
has_next_page = True
page_num = 1

while(has_next_page):
while has_next_page:
try:
next_page_el = self.get_next_page_anchor(driver)

except NoSuchElementException as e:
except NoSuchElementException:
# expected when on last page, set exit condition then parse table
has_next_page = False

try:
for item in self.parse_table(driver, doc_type, index):
yield item

except Exception:
except Exception as exc:
raise NoSuchElementException(
f"Failed to find table to scrape from using css selector: {self.table_selector}"
)
f"Failed to find table to scrape from using selector: {self.tabs_ul_selector}"
) from exc
try:
if has_next_page:
next_page_el.click()
page_num += 1
except Exception as e:
print("Could not go to next page: " + e)

def parse_table(self, driver: Chrome, doc_type, index):
response = Selector(text=driver.page_source)
rows_selector = f'table#dnn_ctr48257_ViewTabs_rptTabBody_Default_{index}_List_{index}_grdData_{index} tr'
rows = response.css(rows_selector)
def parse_table(
self, driver: Chrome, doc_type: str, index: int
) -> Generator[DocItem, Any, None]:
"""Parse table for all documents"""
soup = bs4.BeautifulSoup(driver.page_source, features="html.parser")
element = soup.find(
id=f"dnn_ctr48257_ViewTabs_rptTabBody_Default_{index}_List_{index}_OuterDiv_{index}"
)
rows = element.find_all("tr")

bumednote_seen = set({})
dup_change_seen = False
if doc_type == "NAVMED":
title_id, publication_id, doc_num_id = 1, 0, 2
else:
title_id, publication_id, doc_num_id = 2, 3, 1

for row in rows:
doc_num_raw: str = row.css('td:nth-child(1)::text').get(default='')
cells = row.find_all("td")
if len(cells) == 0:
continue
doc_num_cell = cells[doc_num_id]
title_cell = cells[title_id]
publication_date_cell = cells[publication_id]
doc_num_raw: str = doc_num_cell.get_text().strip()
if not doc_num_raw:
print("doc num is null, skipping")
continue

doc_title_raw = row.css(
'td:nth-child(2)::text').get(default='')
publication_date = row.css('td:nth-child(3)::text').get(default='')
href_raw = row.css('td:nth-child(4) a::attr(href)').get()
doc_title_raw = title_cell.get_text().strip()
publication_date = publication_date_cell.get_text().strip()
try:
href_raw = doc_num_cell.find_all("a")[0]["href"]
except IndexError:
# Looks for href in full row
try:
href_raw = row.find_all("a")[0]["href"]
except IndexError:
print(f"Could not find link to document {doc_title_raw}")
continue

doc_name = None
doc_num = None
doc_title = None

# Changes for each tab
# BUMEDINST
if index == 0:
doc_num_raw = doc_num_raw.split()[0]
# BUMEDNOTE
elif index == 1:
doc_num_raw = doc_num_raw.replace('NOTE ', '')
if index == 1:
doc_num_raw = doc_num_raw.replace("NOTE ", "")
# BUMEDNOTE has a lot of duplicate nums with completely different docs
if doc_num_raw in bumednote_seen:
doc_num_raw = f"{doc_num_raw} {doc_title_raw}"
if doc_num_raw in bumednote_seen:
doc_num_raw = f"{doc_num_raw}-REVISION"

bumednote_seen.add(doc_num_raw)

# NAVMED
elif index == 2:
doc_num_raw = doc_num_raw.replace('.pdf', '')
doc_num_raw = doc_num_raw.replace(".pdf", "")
publication_date, doc_title_raw = doc_title_raw, publication_date

if doc_num_raw[0].isdigit():
@@ -155,9 +203,14 @@ def parse_table(self, driver: Chrome, doc_type, index):
doc_name = f"{ref_name} {doc_num_raw}"

# special case to match old crawler
if doc_name == "NAVMED P-117 MANMED CHANGE 126" and not dup_change_seen:
if (
doc_name == "NAVMED P-117 MANMED CHANGE 126"
and not dup_change_seen
):
dup_change_seen = True
elif doc_name == "NAVMED P-117 MANMED CHANGE 126" and dup_change_seen:
elif (
doc_name == "NAVMED P-117 MANMED CHANGE 126" and dup_change_seen
):
doc_name = "NAVMED P-117 MANMED CHANGE 126-1"

if not doc_num:
@@ -166,97 +219,43 @@ def parse_table(self, driver: Chrome, doc_type, index):
doc_title = self.ascii_clean(doc_title_raw)

if not href_raw:
print("href is null, skipping")
continue

download_url = self.ensure_full_href_url(
href_raw, self.start_urls[0])
download_url = self.ensure_full_href_url(href_raw, self.start_urls[0])

if not doc_name:
doc_name = f"{doc_type} {doc_num}"

cac_login_required = False
if doc_title.endswith('*'):
if doc_title.endswith("*"):
cac_login_required = True
doc_title = doc_title[:-1]
doc_name = doc_name[:-1]

fields = {
'doc_name': doc_name,
'doc_num': doc_num,
'doc_title': doc_title,
'doc_type': doc_type,
'cac_login_required': cac_login_required,
'download_url': download_url,
'publication_date': publication_date
}
## Instantiate DocItem class and assign document's metadata values
doc_item = self.populate_doc_item(fields)

yield doc_item



def populate_doc_item(self, fields):
'''
This function provides both hardcoded and computed values for the variables
in the imported DocItem object and returns the populated metadata object
'''
display_org="US Navy Medicine" # Level 1: GC app 'Source' filter for docs from this crawler
data_source = "Navy Medicine" # Level 2: GC app 'Source' metadata field for docs from this crawler
source_title = "Unlisted Source" # Level 3 filter

doc_name = fields['doc_name']
doc_num = fields['doc_num']
doc_title = fields['doc_title']
doc_type = fields['doc_type']
cac_login_required = fields['cac_login_required']
download_url = fields['download_url']
publication_date = get_pub_date(fields['publication_date'])

display_doc_type = "Document" # Doc type for display on app
display_source = data_source + " - " + source_title
display_title = doc_type + " " + doc_num + ": " + doc_title
is_revoked = False
source_page_url = self.start_urls[0]
source_fqdn = urlparse(source_page_url).netloc

downloadable_items = [{
"doc_type": "pdf",
"download_url": download_url,
"compression_type": None,
}]

## Assign fields that will be used for versioning
version_hash_fields = {
"doc_name":doc_name,
"doc_num": doc_num,
"publication_date": publication_date,
"download_url": download_url,
"display_title": display_title
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)

return DocItem(
doc_name = doc_name,
doc_title = doc_title,
doc_num = doc_num,
doc_type = doc_type,
display_doc_type = display_doc_type, #
publication_date = publication_date,
cac_login_required = cac_login_required,
crawler_used = self.name,
downloadable_items = downloadable_items,
source_page_url = source_page_url, #
source_fqdn = source_fqdn, #
download_url = download_url, #
version_hash_raw_data = version_hash_fields, #
version_hash = version_hash,
display_org = display_org, #
data_source = data_source, #
source_title = source_title, #
display_source = display_source, #
display_title = display_title, #
file_ext = doc_type, #
is_revoked = is_revoked, #
)
fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=doc_num,
doc_type=doc_type,
publication_date=parse_timestamp(publication_date),
cac_login_required=cac_login_required,
source_page_url=self.start_urls[0],
downloadable_items=[
{
"doc_type": "pdf",
"download_url": download_url,
"compression_type": None,
}
],
download_url=download_url,
file_ext="pdf",
display_doc_type="Document",  # Doc type for display on app
)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)
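
For context on the refactor: the per-spider populate_doc_item method removed above is replaced by the shared DocItemFields helper. Based only on the deleted code in this diff, the derived metadata that DocItemFields.populate_doc_item presumably consolidates looks roughly like the sketch below; the attribute names on fields are illustrative assumptions, and doc_item_fields.py in the repo remains the authoritative implementation.

# Hedged sketch reconstructed from the removed NavyMedSpider.populate_doc_item above;
# illustrative only, not the actual DocItemFields implementation.
from urllib.parse import urlparse
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest

def sketch_populate_doc_item(fields, display_org, data_source, source_title, crawler_used):
    # Display strings shown in the GC app, built from the raw field values.
    display_source = f"{data_source} - {source_title}"
    display_title = f"{fields.doc_type} {fields.doc_num}: {fields.doc_title}"
    source_fqdn = urlparse(fields.source_page_url).netloc
    # Fields used for versioning; hashing them lets the pipeline detect changed docs.
    version_hash_fields = {
        "doc_name": fields.doc_name,
        "doc_num": fields.doc_num,
        "publication_date": fields.publication_date,
        "download_url": fields.download_url,
        "display_title": display_title,
    }
    version_hash = dict_to_sha256_hex_digest(version_hash_fields)
    # These derived values, the raw fields, and the arguments above (display_org,
    # data_source, source_title, crawler_used) are then packed into the DocItem
    # that the spider yields.
    return {
        "display_source": display_source,
        "display_title": display_title,
        "source_fqdn": source_fqdn,
        "version_hash_raw_data": version_hash_fields,
        "version_hash": version_hash,
    }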