Patch army g1 spider #228

Merged (2 commits, May 8, 2024)

14 changes: 8 additions & 6 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -38,19 +38,22 @@ def __init__(
         self.file_ext = file_ext
         self.display_title = doc_type + " " + doc_num + ": " + doc_title
 
-    def get_version_hash_fields(self) -> dict:
-        """Returns a dict of the fields used for hashing"""
-        return {
+        self.hash_fields = {
             "doc_name": self.doc_name,
             "doc_num": self.doc_num,
             "publication_date": self.publication_date,
             "download_url": self.download_url,
             "display_title": self.display_title,
         }
 
+    def set_version_hash_field(self, key: str, value: str) -> None:
+        """Sets a new field or updates an old one in the dict used for hashing"""
+        self.hash_fields[key] = value
+
     def set_display_name(self, name: str) -> None:
         """Update display name for DocItemFields instance"""
         self.display_title = name
+        self.hash_fields["display_title"] = name
 
     def populate_doc_item(
         self, display_org: str, data_source: str, source_title: str, crawler_used: str
@@ -70,8 +73,7 @@
         display_source = data_source + " - " + source_title
         is_revoked = False
         source_fqdn = urlparse(self.source_page_url).netloc
-        version_hash_fields = self.get_version_hash_fields()
-        version_hash = dict_to_sha256_hex_digest(version_hash_fields)
+        version_hash = dict_to_sha256_hex_digest(self.hash_fields)
 
         return DocItem(
             doc_name=self.doc_name,
@@ -86,7 +88,7 @@
             source_page_url=self.source_page_url,
             source_fqdn=source_fqdn,
             download_url=self.download_url,
-            version_hash_raw_data=version_hash_fields,
+            version_hash_raw_data=self.hash_fields,
             version_hash=version_hash,
             display_org=display_org,
             data_source=data_source,
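
For context, a minimal usage sketch of the reworked hashing API above, assuming only the DocItemFields calls visible in this diff; the metadata values and URLs are placeholders, not real crawler output.

from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

# Placeholder metadata for illustration only; a real spider scrapes these values.
fields = DocItemFields(
    doc_name="example_pam",
    doc_title="Example Pamphlet Title",
    doc_num="611-21",
    doc_type="DA PAM",
    display_doc_type="DA PAM",
    publication_date=datetime(2024, 5, 1),
    cac_login_required=False,
    source_page_url="https://www.army.mil/g-1#org-g-1-publications",
    downloadable_items=[
        {"doc_type": "pdf", "download_url": "https://example.mil/example_pam.pdf", "compression_type": None}
    ],
    download_url="https://example.mil/example_pam.pdf",
    file_ext="pdf",
)

# hash_fields is now built in __init__; individual entries can be overridden before hashing.
fields.set_version_hash_field("display_title", fields.doc_title)

# populate_doc_item now hashes self.hash_fields directly instead of calling get_version_hash_fields().
doc_item = fields.populate_doc_item(
    display_org="Dept. of the Army",
    data_source="Army Publishing Directorate",
    source_title="G-1 Publications",
    crawler_used="army_g1_pubs",
)
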
271 changes: 132 additions & 139 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/army_g1_spider.py
@@ -1,175 +1,168 @@
import scrapy
from typing import Any, Generator, Union
import re
import time
from urllib.parse import urljoin, urlparse
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
from datetime import datetime
from bs4 import BeautifulSoup
import json
import html
from datetime import datetime
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class ArmyG1Spider(GCSpider):
name = 'army_g1_pubs'
start_urls = ['https://www.army.mil/g-1#org-g-1-publications']
"""
As of 05/01/2024
crawls https://www.army.mil/g-1#org-g-1-publications for 122 pdfs (doc_type = DA PAM)
"""

# Crawler name
name = "army_g1_pubs"
# Level 1: GC app 'Source' filter for docs from this crawler
display_org = "Dept. of the Army"
# Level 2: GC app 'Source' metadata field for docs from this crawler
data_source = "Army Publishing Directorate"
# Level 3 filter
source_title = "G-1 Publications"

start_urls = ["https://www.army.mil/g-1#org-g-1-publications"]
rotate_user_agent = True
randomly_delay_request = True
custom_settings = {
**GCSpider.custom_settings,
"DOWNLOAD_DELAY": 5,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_START_DELAY": 1,
"AUTOTHROTTLE_MAX_DELAY": 10,
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
}

def encoding(self, text):
@staticmethod
def is_ascii_encoded(text: str) -> bool:
"""Returns true if text is ascii encoded"""
try:
text.encode('ascii')
text.encode("ascii")
return False
except UnicodeEncodeError:
return True

def extract_doc_name_from_url(self, url):
doc_name = url.split('/')[-1].split('.')[0]
@staticmethod
def extract_doc_name_from_url(url: str) -> str:
"""Parses doc name out of url"""
doc_name = url.split("/")[-1].split(".")[0]
return doc_name

def extract_doc_number(self, doc_number):
pattern = r'(\d{2,4}-\d{1,4})'
match = re.search(pattern, doc_number)
@staticmethod
def extract_doc_number(text: str) -> str:
"""Uses regex to pull doc number from container label"""
pattern = r"(\d{2,4}-\d{1,4})"
match = re.search(pattern, text)
if match:
return match.group(1)
else:
return 'N/A'
return "N/A"

def title_edge_cases(self, text, label):
# renames documents if incorrect on website
@staticmethod
def title_edge_cases(text: str, label: str) -> str:
"""Renames documents if incorrect on website"""
if "Board Brief; NCO Evaluation Board Supplement" in text:
return (label + " Board Brief")
elif "NCO Evaluation Board Supplement" in text:
return label + " Board Brief"
if "NCO Evaluation Board Supplement" in text:
return label
elif text.endswith('.pdf') or text.endswith('docx'):
if text.endswith(".pdf") or text.endswith("docx"):
return label
else:
pattern = r'(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}'
cleaned_text = re.sub(pattern, '', text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text

def extract_date_from_url(self, url):
pattern = r'(\d{4}/\d{2}/\d{2})'
match = re.search(pattern, url)
pattern = r"(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}"
cleaned_text = re.sub(pattern, "", text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text

@staticmethod
def extract_date_from_url(url: str) -> Union[datetime, str]:
"""Accepts url then parses and returns a datetime object"""
pattern = r"(\d{4}/\d{2}/\d{2})"
match = re.search(pattern, url)
if match:
date = match.group(1)
datetime_ = datetime.strptime(date, "%Y/%m/%d")
return datetime_.strftime("%m-%d-%Y")
else:
return "Unknown"


def parse(self, response):
for container in response.css('.inner-container'):
return datetime_
return "Unknown"

def parse_anchor_tag(
self, link: str, text: str, label_text: str, container_label: str, url: str
) -> Generator[DocItem, Any, None]:
"""Takes in data from anchor tag element and returns the DocItem"""
# only consider links that lead to documents
if link.endswith(".pdf") or link.endswith(".docx"):
# check if title needs to be encoded before conversion to string
if self.is_ascii_encoded(text):
text = str(text.encode("utf-8"))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
# 'pdf' if link.endswith('.pdf'), 'docx' if link.endswith('.docx'), else None
file_type = self.get_href_file_extension(link)

downloadable_items = [
{
"doc_type": file_type,
"download_url": link,
"compression_type": None,
}
]
fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=doc_number,
doc_type="DA PAM",
display_doc_type="DA PAM",
publication_date=publication_date,
cac_login_required=False,
source_page_url=url,
downloadable_items=downloadable_items,
download_url=link,
file_ext=file_type,
)
# backwards compatibility by setting display_title to doc_title in hash fields
fields.set_version_hash_field("display_title", fields.doc_title)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)

def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
"""Parses doc items out of Army G1 Publications site"""
for container in response.css(".inner-container"):
# title of each section
container_label = container.css('h4::text').extract_first()
container_label = container.css("h4::text").extract_first()

for accordion in container.css(".accordion-container"):

for accordion in container.css('.accordion-container'):
for item in accordion.css(".accordion"):

for item in accordion.css('.accordion li'):
# get title text *within* each accordion tab
label_text = item.css("label[for]::text").get().strip()

# get title text *within* each accordion tab
label_text = item.css('label[for]::text').get().strip()

# convert html to string
soup = BeautifulSoup(item.get(), 'html.parser')
div_tag = soup.find('div', class_='rich-text-element bodytext')

if div_tag:
delta_data = div_tag.get('data-delta')
if delta_data:
# parse delta_data as JSON
data = json.loads(delta_data)
for op in data["ops"]:
if 'attributes' in op and 'link' in op['attributes']:
# URL link
link = op['attributes']['link']

# only consider links that lead to documents
if link.endswith('.pdf') or link.endswith('.docx'):
# extract title
text = op['insert']

# check if title needs to be encoded before conversion to string
if self.encoding(text):
text = str(text.encode('utf-8'))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
#file_type = 'pdf' if link.endswith('.pdf') else ('docx' if link.endswith('.docx') else None)
file_type = self.get_href_file_extension(link)

fields = {
'doc_name': doc_name,
'doc_num': doc_number,
'doc_title': doc_title,
'doc_type': "DA PAM",
'display_doc_type': "DA PAM",
'file_type': file_type,
'download_url': link,
'source_page_url': response.url,
'publication_date': publication_date,
'cac_login_required': False,
'is_revoked': False
}

doc_item = self.populate_doc_item(fields)
yield doc_item

def populate_doc_item(self, fields):
display_org = "Dept. of the Army"
data_source = "Army Publishing Directorate"
source_title = "G-1 Publications"

version_hash_fields = {
"doc_name": fields['doc_name'],
"doc_num": fields['doc_num'],
"publication_date": get_pub_date(fields['publication_date']),
"download_url": fields['download_url'],
"display_title": fields['doc_title']
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)

return DocItem(
doc_name=fields['doc_name'],
doc_title=fields['doc_title'],
doc_num=fields['doc_num'],
doc_type=fields['doc_type'],
display_doc_type=fields['display_doc_type'],
publication_date=get_pub_date(fields['publication_date']),
cac_login_required=fields['cac_login_required'],
crawler_used=self.name,
downloadable_items=[{
"doc_type": fields['file_type'],
"download_url": fields['download_url'],
"compression_type": None
}],
source_page_url=fields['source_page_url'],
source_fqdn=urlparse(fields['source_page_url']).netloc,
download_url=fields['download_url'],
version_hash_raw_data=version_hash_fields,
version_hash=version_hash,
display_org=display_org,
data_source=data_source,
source_title=source_title,
display_source=data_source + " - " + source_title,
display_title=fields['doc_type'] + " " + fields['doc_num'] + ": " + fields['doc_title'],
file_ext=fields['file_type'], # 'pdf'
is_revoked=fields['is_revoked']
)
soup = BeautifulSoup(item.get(), "html.parser")
div_tag = soup.find("div", class_="rich-text-element bodytext")

if div_tag is None:
continue
# Find all anchor tags
anchor_tags = soup.find_all("a")

# Extract URLs and text
for tag in anchor_tags:
link = tag["href"]
text = tag.get_text()
yield from self.parse_anchor_tag(
link, text, label_text, container_label, response.url
)
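
A quick, self-contained check of the new static helpers; the label and URLs below are made-up examples rather than real site data. Since the helpers are now staticmethods, they can be exercised without instantiating the spider.

from dataPipelines.gc_scrapy.gc_scrapy.spiders.army_g1_spider import ArmyG1Spider

# Made-up inputs shaped like the G-1 page content, for illustration only.
print(ArmyG1Spider.extract_doc_number("DA PAM 611-21 Example Pamphlet Title"))  # -> "611-21"
print(ArmyG1Spider.extract_doc_name_from_url(
    "https://www.army.mil/e2/downloads/2024/05/01/example/da_pam_611_21.pdf"
))  # -> "da_pam_611_21"
print(ArmyG1Spider.extract_date_from_url(
    "https://www.army.mil/e2/downloads/2024/05/01/example/da_pam_611_21.pdf"
))  # -> datetime(2024, 5, 1)
print(ArmyG1Spider.is_ascii_encoded("Officer\u00a0Promotions"))  # -> True (contains a non-breaking space)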