Skip to content

Commit

Permalink
catch pdfs with the same doc number
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew Kersting committed Jun 11, 2024
1 parent d0aea9a commit 2ba5576
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/navy_med_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
class NavyMedSpider(GCSeleniumSpider):
"""
As of 05/23/2024
crawls https://www.med.navy.mil/Directives for 398 pdfs total: 318 pdfs (doc_type = BUMEDINST),
22 pdfs(doc_type = BUMEDNOTE) & and 58 pdfs (doc_type = NAVMED)
crawls https://www.med.navy.mil/Directives for 402 pdfs total: 321 pdfs (doc_type = BUMEDINST),
24 pdfs (doc_type = BUMEDNOTE) and 57 pdfs (doc_type = NAVMED)
Note: BUMEDINST has 322 docs in the table but one requires a CAC
"""

# Crawler name
Expand Down Expand Up @@ -176,15 +177,14 @@ def parse_table(
doc_title = None

# Changes for each tab
# BUMEDINST
if index == 0:
doc_num_raw = doc_num_raw.split()[0]
# BUMEDNOTE
elif index == 1:
if index == 1:
doc_num_raw = doc_num_raw.replace("NOTE ", "")
# BUMEDNOTE has a lot of duplicate nums with completely different docs
if doc_num_raw in bumednote_seen:
doc_num_raw = f"{doc_num_raw} {doc_title_raw}"
if doc_num_raw in bumednote_seen:
doc_num_raw = f"{doc_num_raw}-REVISION"

bumednote_seen.add(doc_num_raw)

Expand Down

0 comments on commit 2ba5576

Please sign in to comment.