Commit

Patch army g1 spider (#228)
* new div selection for updated site

* refactor and add doc strings

---------

Co-authored-by: Matthew Kersting <[email protected]>
matthew-kersting and Matthew Kersting committed May 8, 2024
1 parent 2592e7e commit ff85a6a
Showing 2 changed files with 140 additions and 145 deletions.
14 changes: 8 additions & 6 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -38,19 +38,22 @@ def __init__(
self.file_ext = file_ext
self.display_title = doc_type + " " + doc_num + ": " + doc_title

def get_version_hash_fields(self) -> dict:
"""Returns a dict of the fields used for hashing"""
return {
self.hash_fields = {
"doc_name": self.doc_name,
"doc_num": self.doc_num,
"publication_date": self.publication_date,
"download_url": self.download_url,
"display_title": self.display_title,
}

def set_version_hash_field(self, key: str, value: str) -> None:
"""Sets a new field or updates an old one in the dict used for hashing"""
self.hash_fields[key] = value

def set_display_name(self, name: str) -> None:
"""Update display name for DocItemFields instance"""
self.display_title = name
self.hash_fields["display_title"] = name

def populate_doc_item(
self, display_org: str, data_source: str, source_title: str, crawler_used: str
@@ -70,8 +73,7 @@ def populate_doc_item(
display_source = data_source + " - " + source_title
is_revoked = False
source_fqdn = urlparse(self.source_page_url).netloc
version_hash_fields = self.get_version_hash_fields()
version_hash = dict_to_sha256_hex_digest(version_hash_fields)
version_hash = dict_to_sha256_hex_digest(self.hash_fields)

return DocItem(
doc_name=self.doc_name,
@@ -86,7 +88,7 @@
source_page_url=self.source_page_url,
source_fqdn=source_fqdn,
download_url=self.download_url,
version_hash_raw_data=version_hash_fields,
version_hash_raw_data=self.hash_fields,
version_hash=version_hash,
display_org=display_org,
data_source=data_source,
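
For reference, a minimal sketch of how a crawler can exercise the refactored hashing fields, assuming the DocItemFields constructor signature used in the spider diff below (all field values are illustrative placeholders, not real documents):

from datetime import datetime
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest

fields = DocItemFields(
    doc_name="example_pam",
    doc_title="Example Title",
    doc_num="611-21",
    doc_type="DA PAM",
    display_doc_type="DA PAM",
    publication_date=datetime(2024, 5, 1),
    cac_login_required=False,
    source_page_url="https://www.army.mil/g-1#org-g-1-publications",
    downloadable_items=[
        {"doc_type": "pdf", "download_url": "https://example.mil/example_pam.pdf", "compression_type": None}
    ],
    download_url="https://example.mil/example_pam.pdf",
    file_ext="pdf",
)

# hash_fields is now built once in __init__; individual keys can still be overridden before hashing
fields.set_version_hash_field("display_title", fields.doc_title)
version_hash = dict_to_sha256_hex_digest(fields.hash_fields)
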
271 changes: 132 additions & 139 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/army_g1_spider.py
@@ -1,175 +1,168 @@
import scrapy
from typing import Any, Generator, Union
import re
import time
from urllib.parse import urljoin, urlparse
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
from datetime import datetime
from bs4 import BeautifulSoup
import json
import html
from datetime import datetime
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class ArmyG1Spider(GCSpider):
name = 'army_g1_pubs'
start_urls = ['https://www.army.mil/g-1#org-g-1-publications']
"""
As of 05/01/2024
crawls https://www.army.mil/g-1#org-g-1-publications for 122 pdfs (doc_type = DA PAM)
"""

# Crawler name
name = "army_g1_pubs"
# Level 1: GC app 'Source' filter for docs from this crawler
display_org = "Dept. of the Army"
# Level 2: GC app 'Source' metadata field for docs from this crawler
data_source = "Army Publishing Directorate"
# Level 3 filter
source_title = "G-1 Publications"

start_urls = ["https://www.army.mil/g-1#org-g-1-publications"]
rotate_user_agent = True
randomly_delay_request = True
custom_settings = {
**GCSpider.custom_settings,
"DOWNLOAD_DELAY": 5,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_START_DELAY": 1,
"AUTOTHROTTLE_MAX_DELAY": 10,
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
}

def encoding(self, text):
@staticmethod
def is_ascii_encoded(text: str) -> bool:
"""Returns true if text is ascii encoded"""
try:
text.encode('ascii')
text.encode("ascii")
return False
except UnicodeEncodeError:
return True

def extract_doc_name_from_url(self, url):
doc_name = url.split('/')[-1].split('.')[0]
@staticmethod
def extract_doc_name_from_url(url: str) -> str:
"""Parses doc name out of url"""
doc_name = url.split("/")[-1].split(".")[0]
return doc_name

def extract_doc_number(self, doc_number):
pattern = r'(\d{2,4}-\d{1,4})'
match = re.search(pattern, doc_number)
@staticmethod
def extract_doc_number(text: str):
"""Uses regex to pull doc number from container label"""
pattern = r"(\d{2,4}-\d{1,4})"
match = re.search(pattern, text)
if match:
return match.group(1)
else:
return 'N/A'
return "N/A"

def title_edge_cases(self, text, label):
# renames documents if incorrect on website
@staticmethod
def title_edge_cases(text: str, label: str) -> str:
"""Renames documents if incorrect on website"""
if "Board Brief; NCO Evaluation Board Supplement" in text:
return (label + " Board Brief")
elif "NCO Evaluation Board Supplement" in text:
return label + " Board Brief"
if "NCO Evaluation Board Supplement" in text:
return label
elif text.endswith('.pdf') or text.endswith('docx'):
if text.endswith(".pdf") or text.endswith("docx"):
return label
else:
pattern = r'(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}'
cleaned_text = re.sub(pattern, '', text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text

def extract_date_from_url(self, url):
pattern = r'(\d{4}/\d{2}/\d{2})'
match = re.search(pattern, url)
pattern = r"(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}"
cleaned_text = re.sub(pattern, "", text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text

@staticmethod
def extract_date_from_url(url: str) -> Union[datetime, str]:
"""Accepts url then parses and returns a datetime object"""
pattern = r"(\d{4}/\d{2}/\d{2})"
match = re.search(pattern, url)
if match:
date = match.group(1)
datetime_ = datetime.strptime(date, "%Y/%m/%d")
return datetime_.strftime("%m-%d-%Y")
else:
return "Unknown"


def parse(self, response):
for container in response.css('.inner-container'):
return datetime_
return "Unknown"

def parse_anchor_tag(
self, link: str, text: str, label_text: str, container_label: str, url: str
) -> Generator[DocItem, Any, None]:
"""Takes in data from anchor tag element and returns the DocItem"""
# only consider links that lead to documents
if link.endswith(".pdf") or link.endswith(".docx"):
# check if title needs to be encoded before conversion to string
if self.is_ascii_encoded(text):
text = str(text.encode("utf-8"))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
# file extension parsed from the link: 'pdf', 'docx', or None
file_type = self.get_href_file_extension(link)

downloadable_items = [
{
"doc_type": file_type,
"download_url": link,
"compression_type": None,
}
]
fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=doc_number,
doc_type="DA PAM",
display_doc_type="DA PAM",
publication_date=publication_date,
cac_login_required=False,
source_page_url=url,
downloadable_items=downloadable_items,
download_url=link,
file_ext=file_type,
)
# backwards compatibility: keep display_title equal to doc_title in the hash fields
fields.set_version_hash_field("display_title", fields.doc_title)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)

def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
"""Parses doc items out of Army G1 Publications site"""
for container in response.css(".inner-container"):
# title of each section
container_label = container.css('h4::text').extract_first()
container_label = container.css("h4::text").extract_first()

for accordion in container.css(".accordion-container"):

for accordion in container.css('.accordion-container'):
for item in accordion.css(".accordion"):

for item in accordion.css('.accordion li'):
# get title text *within* each accordion tab
label_text = item.css("label[for]::text").get().strip()

# get title text *within* each accordion tab
label_text = item.css('label[for]::text').get().strip()

# convert html to string
soup = BeautifulSoup(item.get(), 'html.parser')
div_tag = soup.find('div', class_='rich-text-element bodytext')

if div_tag:
delta_data = div_tag.get('data-delta')
if delta_data:
# parse delta_data as JSON
data = json.loads(delta_data)
for op in data["ops"]:
if 'attributes' in op and 'link' in op['attributes']:
# URL link
link = op['attributes']['link']

# only consider links that lead to documents
if link.endswith('.pdf') or link.endswith('.docx'):
# extract title
text = op['insert']

# check if title needs to be encoded before conversion to string
if self.encoding(text):
text = str(text.encode('utf-8'))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
#file_type = 'pdf' if link.endswith('.pdf') else ('docx' if link.endswith('.docx') else None)
file_type = self.get_href_file_extension(link)

fields = {
'doc_name': doc_name,
'doc_num': doc_number,
'doc_title': doc_title,
'doc_type': "DA PAM",
'display_doc_type': "DA PAM",
'file_type': file_type,
'download_url': link,
'source_page_url': response.url,
'publication_date': publication_date,
'cac_login_required': False,
'is_revoked': False
}

doc_item = self.populate_doc_item(fields)
yield doc_item

def populate_doc_item(self, fields):
display_org = "Dept. of the Army"
data_source = "Army Publishing Directorate"
source_title = "G-1 Publications"

version_hash_fields = {
"doc_name": fields['doc_name'],
"doc_num": fields['doc_num'],
"publication_date": get_pub_date(fields['publication_date']),
"download_url": fields['download_url'],
"display_title": fields['doc_title']
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)

return DocItem(
doc_name=fields['doc_name'],
doc_title=fields['doc_title'],
doc_num=fields['doc_num'],
doc_type=fields['doc_type'],
display_doc_type=fields['display_doc_type'],
publication_date=get_pub_date(fields['publication_date']),
cac_login_required=fields['cac_login_required'],
crawler_used=self.name,
downloadable_items=[{
"doc_type": fields['file_type'],
"download_url": fields['download_url'],
"compression_type": None
}],
source_page_url=fields['source_page_url'],
source_fqdn=urlparse(fields['source_page_url']).netloc,
download_url=fields['download_url'],
version_hash_raw_data=version_hash_fields,
version_hash=version_hash,
display_org=display_org,
data_source=data_source,
source_title=source_title,
display_source=data_source + " - " + source_title,
display_title=fields['doc_type'] + " " + fields['doc_num'] + ": " + fields['doc_title'],
file_ext=fields['file_type'], # 'pdf'
is_revoked=fields['is_revoked']
)
soup = BeautifulSoup(item.get(), "html.parser")
div_tag = soup.find("div", class_="rich-text-element bodytext")

if div_tag is None:
continue
# Find all anchor tags
anchor_tags = soup.find_all("a")

# Extract URLs and text
for tag in anchor_tags:
link = tag["href"]
text = tag.get_text()
yield from self.parse_anchor_tag(
link, text, label_text, container_label, response.url
)
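
As a quick sanity check on the new static helpers, a sketch that can be run from the repository environment (the sample URL and label text are illustrative, not real documents):

from dataPipelines.gc_scrapy.gc_scrapy.spiders.army_g1_spider import ArmyG1Spider

link = "https://www.army.mil/e2/downloads/2024/05/01/example_pam.pdf"
print(ArmyG1Spider.extract_doc_name_from_url(link))                       # "example_pam"
print(ArmyG1Spider.extract_doc_number("DA PAM 611-21 Selection Boards"))  # "611-21"
print(ArmyG1Spider.extract_date_from_url(link))                           # datetime(2024, 5, 1)
print(ArmyG1Spider.is_ascii_encoded("Officer\u00a0Evaluations"))          # True (non-breaking space is not ASCII)

A full crawl is started the usual Scrapy way, e.g. scrapy crawl army_g1_pubs (assuming the standard Scrapy project layout under dataPipelines/gc_scrapy).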
