Commit ff85a6a (1 parent: 2592e7e)

* new div selection for updated site
* refactor and add doc strings

Co-authored-by: Matthew Kersting <[email protected]>

Showing 2 changed files with 140 additions and 145 deletions.
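
The headline change is the new div selection: instead of decoding each accordion's data-delta JSON attribute, the spider now locates the rich-text-element bodytext div and walks its anchor tags directly. A minimal standalone sketch of that selection step, using invented HTML that mimics the updated army.mil accordion markup (the real spider does this inside parse() in the diff below):

    from bs4 import BeautifulSoup

    # Invented accordion snippet standing in for the updated army.mil markup.
    html_snippet = """
    <li>
      <label for="tab-1">DA PAM 600-3</label>
      <div class="rich-text-element bodytext">
        <a href="https://api.army.mil/e2/c/downloads/2024/05/01/ARN12345.pdf">Officer Talent Management</a>
      </div>
    </li>
    """

    soup = BeautifulSoup(html_snippet, "html.parser")
    div_tag = soup.find("div", class_="rich-text-element bodytext")
    if div_tag is not None:
        # same traversal the refactored parse() uses
        for tag in soup.find_all("a"):
            print(tag["href"], "->", tag.get_text())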
dataPipelines/gc_scrapy/gc_scrapy/spiders/army_g1_spider.py
271 changes: 132 additions & 139 deletions
@@ -1,175 +1,168 @@
-import scrapy
+from typing import Any, Generator, Union
 import re
-import time
-from urllib.parse import urljoin, urlparse
-from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
-from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
-from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
-from datetime import datetime
 from bs4 import BeautifulSoup
 import json
 import html
+from datetime import datetime
+import scrapy
+
+from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
+from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
+from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


 class ArmyG1Spider(GCSpider):
-    name = 'army_g1_pubs'
-    start_urls = ['https://www.army.mil/g-1#org-g-1-publications']
+    """
+    As of 05/01/2024
+    crawls https://www.army.mil/g-1#org-g-1-publications for 122 pdfs (doc_type = DA PAM)
+    """
+
+    # Crawler name
+    name = "army_g1_pubs"
+    # Level 1: GC app 'Source' filter for docs from this crawler
+    display_org = "Dept. of the Army"
+    # Level 2: GC app 'Source' metadata field for docs from this crawler
+    data_source = "Army Publishing Directorate"
+    # Level 3 filter
+    source_title = "G-1 Publications"

+    start_urls = ["https://www.army.mil/g-1#org-g-1-publications"]
     rotate_user_agent = True
     randomly_delay_request = True
     custom_settings = {
         **GCSpider.custom_settings,
         "DOWNLOAD_DELAY": 5,
         "AUTOTHROTTLE_ENABLED": True,
         "AUTOTHROTTLE_START_DELAY": 1,
         "AUTOTHROTTLE_MAX_DELAY": 10,
         "CONCURRENT_REQUESTS_PER_DOMAIN": 2,
     }

-    def encoding(self, text):
+    @staticmethod
+    def is_ascii_encoded(text: str) -> bool:
+        """Returns true if text is ascii encoded"""
         try:
-            text.encode('ascii')
+            text.encode("ascii")
             return False
         except UnicodeEncodeError:
             return True

-    def extract_doc_name_from_url(self, url):
-        doc_name = url.split('/')[-1].split('.')[0]
+    @staticmethod
+    def extract_doc_name_from_url(url: str) -> str:
+        """Parses doc name out of url"""
+        doc_name = url.split("/")[-1].split(".")[0]
         return doc_name

-    def extract_doc_number(self, doc_number):
-        pattern = r'(\d{2,4}-\d{1,4})'
-        match = re.search(pattern, doc_number)
+    @staticmethod
+    def extract_doc_number(text: str):
+        """Uses regex to pull doc number from container label"""
+        pattern = r"(\d{2,4}-\d{1,4})"
+        match = re.search(pattern, text)
         if match:
             return match.group(1)
-        else:
-            return 'N/A'
+        return "N/A"

-    def title_edge_cases(self, text, label):
-        # renames documents if incorrect on website
+    @staticmethod
+    def title_edge_cases(text: str, label: str) -> str:
+        """Renames documents if incorrect on website"""
         if "Board Brief; NCO Evaluation Board Supplement" in text:
-            return (label + " Board Brief")
-        elif "NCO Evaluation Board Supplement" in text:
+            return label + " Board Brief"
+        if "NCO Evaluation Board Supplement" in text:
             return label
-        elif text.endswith('.pdf') or text.endswith('docx'):
+        if text.endswith(".pdf") or text.endswith("docx"):
             return label
-        else:
-            pattern = r'(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}'
-            cleaned_text = re.sub(pattern, '', text)
-            stripped_text = cleaned_text.strip()
-            if "\\xc2\\xa0" in stripped_text:
-                stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
-            decoded_text = html.unescape(stripped_text)
-            return decoded_text
-
-    def extract_date_from_url(self, url):
-        pattern = r'(\d{4}/\d{2}/\d{2})'
-        match = re.search(pattern, url)
+        pattern = r"(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}"
+        cleaned_text = re.sub(pattern, "", text)
+        stripped_text = cleaned_text.strip()
+        if "\\xc2\\xa0" in stripped_text:
+            stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
+        decoded_text = html.unescape(stripped_text)
+        return decoded_text
+
+    @staticmethod
+    def extract_date_from_url(url: str) -> Union[datetime, str]:
+        """Accepts url then parses and returns a datetime object"""
+        pattern = r"(\d{4}/\d{2}/\d{2})"
+        match = re.search(pattern, url)
         if match:
             date = match.group(1)
             datetime_ = datetime.strptime(date, "%Y/%m/%d")
-            return datetime_.strftime("%m-%d-%Y")
-        else:
-            return "Unknown"
-
-
-    def parse(self, response):
-        for container in response.css('.inner-container'):
+            return datetime_
+        return "Unknown"
+
+    def parse_anchor_tag(
+        self, link: str, text: str, label_text: str, container_label: str, url: str
+    ) -> Generator[DocItem, Any, None]:
+        """Takes in data from anchor tag element and returns the DocItem"""
+        # only consider links that lead to documents
+        if link.endswith(".pdf") or link.endswith(".docx"):
+            # check if title needs to be encoded before conversion to string
+            if self.is_ascii_encoded(text):
+                text = str(text.encode("utf-8"))[2:-1]
+
+            # clean data for `fields` dictionary
+            doc_title = self.title_edge_cases(text, label_text)
+            doc_number = self.extract_doc_number(container_label)
+            doc_name = self.extract_doc_name_from_url(link)
+            publication_date = self.extract_date_from_url(link)
+            # 'pdf' if link.endswith('.pdf'), 'docx' if link.endswith('.docx'), else None
+            file_type = self.get_href_file_extension(link)
+
+            downloadable_items = [
+                {
+                    "doc_type": file_type,
+                    "download_url": link,
+                    "compression_type": None,
+                }
+            ]
+            fields = DocItemFields(
+                doc_name=doc_name,
+                doc_title=doc_title,
+                doc_num=doc_number,
+                doc_type="DA PAM",
+                display_doc_type="DA PAM",
+                publication_date=publication_date,
+                cac_login_required=False,
+                source_page_url=url,
+                downloadable_items=downloadable_items,
+                download_url=link,
+                file_ext=file_type,
+            )
+            # backwards compatability by setting display_title to doc_title in hash fields
+            fields.set_version_hash_field("display_title", fields.doc_title)
+
+            yield fields.populate_doc_item(
+                display_org=self.display_org,
+                data_source=self.data_source,
+                source_title=self.source_title,
+                crawler_used=self.name,
+            )
+
+    def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
+        """Parses doc items out of Army G1 Publications site"""
+        for container in response.css(".inner-container"):
             # title of each section
-            container_label = container.css('h4::text').extract_first()
+            container_label = container.css("h4::text").extract_first()

-            for accordion in container.css('.accordion-container'):
-
-                for item in accordion.css('.accordion li'):
-
-                    # get title text *within* each accordion tab
-                    label_text = item.css('label[for]::text').get().strip()
+            for accordion in container.css(".accordion-container"):
+                for item in accordion.css(".accordion"):
+                    # get title text *within* each accordion tab
+                    label_text = item.css("label[for]::text").get().strip()

                     # convert html to string
-                    soup = BeautifulSoup(item.get(), 'html.parser')
-                    div_tag = soup.find('div', class_='rich-text-element bodytext')
-
-                    if div_tag:
-                        delta_data = div_tag.get('data-delta')
-                        if delta_data:
-                            # parse delta_data as JSON
-                            data = json.loads(delta_data)
-                            for op in data["ops"]:
-                                if 'attributes' in op and 'link' in op['attributes']:
-                                    # URL link
-                                    link = op['attributes']['link']
-
-                                    # only consider links that lead to documents
-                                    if link.endswith('.pdf') or link.endswith('.docx'):
-                                        # extract title
-                                        text = op['insert']
-
-                                        # check if title needs to be encoded before conversion to string
-                                        if self.encoding(text):
-                                            text = str(text.encode('utf-8'))[2:-1]
-
-                                        # clean data for `fields` dictionary
-                                        doc_title = self.title_edge_cases(text, label_text)
-                                        doc_number = self.extract_doc_number(container_label)
-                                        doc_name = self.extract_doc_name_from_url(link)
-                                        publication_date = self.extract_date_from_url(link)
-                                        #file_type = 'pdf' if link.endswith('.pdf') else ('docx' if link.endswith('.docx') else None)
-                                        file_type = self.get_href_file_extension(link)
-
-                                        fields = {
-                                            'doc_name': doc_name,
-                                            'doc_num': doc_number,
-                                            'doc_title': doc_title,
-                                            'doc_type': "DA PAM",
-                                            'display_doc_type': "DA PAM",
-                                            'file_type': file_type,
-                                            'download_url': link,
-                                            'source_page_url': response.url,
-                                            'publication_date': publication_date,
-                                            'cac_login_required': False,
-                                            'is_revoked': False
-                                        }
-
-                                        doc_item = self.populate_doc_item(fields)
-                                        yield doc_item
-
-    def populate_doc_item(self, fields):
-        display_org = "Dept. of the Army"
-        data_source = "Army Publishing Directorate"
-        source_title = "G-1 Publications"
-
-        version_hash_fields = {
-            "doc_name": fields['doc_name'],
-            "doc_num": fields['doc_num'],
-            "publication_date": get_pub_date(fields['publication_date']),
-            "download_url": fields['download_url'],
-            "display_title": fields['doc_title']
-        }
-
-        version_hash = dict_to_sha256_hex_digest(version_hash_fields)
-
-        return DocItem(
-            doc_name=fields['doc_name'],
-            doc_title=fields['doc_title'],
-            doc_num=fields['doc_num'],
-            doc_type=fields['doc_type'],
-            display_doc_type=fields['display_doc_type'],
-            publication_date=get_pub_date(fields['publication_date']),
-            cac_login_required=fields['cac_login_required'],
-            crawler_used=self.name,
-            downloadable_items=[{
-                "doc_type": fields['file_type'],
-                "download_url": fields['download_url'],
-                "compression_type": None
-            }],
-            source_page_url=fields['source_page_url'],
-            source_fqdn=urlparse(fields['source_page_url']).netloc,
-            download_url=fields['download_url'],
-            version_hash_raw_data=version_hash_fields,
-            version_hash=version_hash,
-            display_org=display_org,
-            data_source=data_source,
-            source_title=source_title,
-            display_source=data_source + " - " + source_title,
-            display_title=fields['doc_type'] + " " + fields['doc_num'] + ": " + fields['doc_title'],
-            file_ext=fields['file_type'],  # 'pdf'
-            is_revoked=fields['is_revoked']
-        )
+                    soup = BeautifulSoup(item.get(), "html.parser")
+                    div_tag = soup.find("div", class_="rich-text-element bodytext")
+
+                    if div_tag is None:
+                        continue
+                    # Find all anchor tags
+                    anchor_tags = soup.find_all("a")
+
+                    # Extract URLs and text
+                    for tag in anchor_tags:
+                        link = tag["href"]
+                        text = tag.get_text()
+                        yield from self.parse_anchor_tag(
+                            link, text, label_text, container_label, response.url
+                        )
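
Because the refactored helpers are static methods, they can be sanity-checked without running a crawl. A minimal sketch, assuming the dataPipelines package is importable; the URLs are made up for illustration, and the expected values follow from the regexes and date format in the diff above:

    from datetime import datetime

    from dataPipelines.gc_scrapy.gc_scrapy.spiders.army_g1_spider import ArmyG1Spider

    # doc number: first NN-NN style token in the section label, else "N/A"
    assert ArmyG1Spider.extract_doc_number("DA PAM 600-3 Officer Talent Management") == "600-3"
    assert ArmyG1Spider.extract_doc_number("no number here") == "N/A"

    # doc name: file stem of the last URL path segment
    url = "https://api.army.mil/e2/c/downloads/2024/05/01/ARN12345.pdf"
    assert ArmyG1Spider.extract_doc_name_from_url(url) == "ARN12345"

    # publication date: a datetime when the URL carries a YYYY/MM/DD segment,
    # otherwise the string "Unknown"
    assert ArmyG1Spider.extract_date_from_url(url) == datetime(2024, 5, 1)
    assert ArmyG1Spider.extract_date_from_url("https://example.mil/doc.pdf") == "Unknown"

Note the behavior change in extract_date_from_url: the old version returned a "%m-%d-%Y" string, while the refactor yields the datetime itself and passes it to DocItemFields, presumably leaving formatting to that class.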