Skip to content

Commit

Permalink
Merge pull request #237 from dod-advana/patch-stale-spiders
Browse files Browse the repository at this point in the history
Patch tradoc and national_guard Spiders
  • Loading branch information
emmarez committed Jul 16, 2024
2 parents 4de0bdd + ebbb328 commit aa7da0c
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 127 deletions.
Original file line number Diff line number Diff line change
@@ -1,66 +1,70 @@
from urllib.parse import urlparse

from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
from urllib.parse import urlparse
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest


class CNGBISpider(GCSpider):
"""
Parser for Chief National Guard Bureau Instructions
Parser for Chief National Guard Bureau Instructions
"""

name = "National_Guard" # Crawler name
display_org = "National Guard" # Level 1: GC app 'Source' filter for docs from this crawler
data_source = "National Guard Bureau Publications & Forms Library" # Level 2: GC app 'Source' metadata field for docs from this crawler
source_title = "Unlisted Source" # Level 3 filter
# Crawler name
name = "National_Guard"
# Level 1: GC app 'Source' filter for docs from this crawler
display_org = "National Guard"
# Level 2: GC app 'Source' metadata field for docs from this crawler
data_source = "National Guard Bureau Publications & Forms Library"
# Level 3 filter
source_title = "Unlisted Source"
display_source = data_source + " - " + source_title

allowed_domains = ['ngbpmc.ng.mil']
start_urls = [
'https://www.ngbpmc.ng.mil/publications1/cngbi/'
]
allowed_domains = ["ngbpmc.ng.mil"]
start_urls = ["https://www.ngbpmc.ng.mil/Publications/CNGB-Instructions/"]

file_type = "pdf"
doc_type = "CNGBI"
rotate_user_agent = True

def parse(self, response):
rows = response.css('div.WordSection1 table tbody tr')
rows = response.css("div.WordSection1 table tbody tr")
for row in rows:
href_raw = row.css('td:nth-child(1) a::attr(href)').get()
href_raw = row.css("td:nth-child(1) a::attr(href)").get()

if not href_raw.startswith('/'):
if not href_raw.startswith("/"):
cac_login_required = True
else:
cac_login_required = False

web_url = self.ensure_full_href_url(href_raw, self.start_urls[0])

file_type = self.get_href_file_extension(href_raw)
web_url = web_url.replace(' ', '%20')
web_url = web_url.replace(" ", "%20")
downloadable_items = [
{
"doc_type": file_type,
"download_url": web_url,
"compression_type": None
"compression_type": None,
}
]

# a lot of the docs have the space unicode \xa0 in them. replacing it before getting doc_num
doc_name_raw = row.css('td:nth-child(1) a::text')
# many doc names contain the non-breaking space \xa0, which needs to be replaced before getting doc_num
doc_name_raw = row.css("td:nth-child(1) a::text")
if doc_name_raw:
doc_name_raw = doc_name_raw.get().replace(u'\xa0', ' ')
doc_name_raw = doc_name_raw.get().replace("\xa0", " ")
else:
continue

doc_num_raw = doc_name_raw.replace('CNGBI ', '')
doc_num_raw = doc_name_raw.replace("CNGBI ", "")

publication_date = row.css('td:nth-child(2) span::text').get()
publication_date = row.css("td:nth-child(2) span::text").get()

doc_title_raw = row.css('td:nth-child(3) a::text').get()
doc_title_raw = row.css("td:nth-child(3) a::text").get()
if doc_title_raw is None:
doc_title_raw = row.css('td:nth-child(3) span::text').get()
doc_title_raw = row.css("td:nth-child(3) span::text").get()
if doc_title_raw is None:
doc_title_raw = row.css('td:nth-child(3) font::text').get()
doc_title_raw = row.css("td:nth-child(3) font::text").get()
if doc_title_raw is None:
print("uh oh")

Expand All @@ -72,7 +76,7 @@ def parse(self, response):
version_hash_fields = {
"item_currency": href_raw,
"document_title": doc_title,
"document_number": doc_num_raw
"document_number": doc_num_raw,
}
version_hash = dict_to_sha256_hex_digest(version_hash_fields)

Expand All @@ -99,4 +103,3 @@ def parse(self, response):
file_ext=self.file_type,
is_revoked=False,
)

Loading

0 comments on commit aa7da0c

Please sign in to comment.