Commit 3c37947: refactor and add doc strings

Matthew Kersting committed May 3, 2024
1 parent ab6743c commit 3c37947
Showing 2 changed files with 97 additions and 80 deletions.
5 changes: 3 additions & 2 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -43,16 +43,17 @@ def __init__(
"doc_num": self.doc_num,
"publication_date": self.publication_date,
"download_url": self.download_url,
"display_title": self.display_title,
"display_title": self.display_title,
}

def set_version_hash_field(self, key: str, value: str) -> None:
"""Sets a new field or updates an old one in the dict used for hashing"""
self.hash_fields[key] = value

def set_display_name(self, name: str) -> None:
"""Update display name for DocItemFields instance"""
self.display_title = name
self.hash_fields["display_title"] = name

def populate_doc_item(
self, display_org: str, data_source: str, source_title: str, crawler_used: str
172 changes: 94 additions & 78 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/army_g1_spider.py
@@ -1,10 +1,13 @@
from typing import Any, Generator, Union
import re
from bs4 import BeautifulSoup
import html
from datetime import datetime
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class ArmyG1Spider(GCSpider):
@@ -34,53 +37,108 @@ class ArmyG1Spider(GCSpider):
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
}

def encoding(self, text):
@staticmethod
def is_ascii_encoded(text: str) -> bool:
"""Returns true if text is ascii encoded"""
try:
text.encode("ascii")
return False
except UnicodeEncodeError:
return True
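# Illustrative sketch, not part of this commit: a title containing a
# non-breaking space fails ASCII encoding, which is what the check above
# detects; parse_anchor_tag below then applies the utf-8 escape conversion.
# The sample string is made up.
sample_title = "DA PAM 600\u00a025"  # contains a non-breaking space
try:
    sample_title.encode("ascii")
    has_non_ascii = False
except UnicodeEncodeError:
    has_non_ascii = True
print(has_non_ascii)  # True
print(str(sample_title.encode("utf-8"))[2:-1])  # DA PAM 600\xc2\xa025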

def extract_doc_name_from_url(self, url):
@staticmethod
def extract_doc_name_from_url(url: str) -> str:
"""Parses doc name out of url"""
doc_name = url.split("/")[-1].split(".")[0]
return doc_name
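# Illustrative sketch, not part of this commit: the doc name is the file name
# without its extension. The URL is a made-up example.
print("https://example.mil/2024/05/03/da_pam_600-25.pdf".split("/")[-1].split(".")[0])  # da_pam_600-25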

def extract_doc_number(self, doc_number):
@staticmethod
def extract_doc_number(text: str) -> str:
"""Uses regex to pull doc number from container label"""
pattern = r"(\d{2,4}-\d{1,4})"
match = re.search(pattern, doc_number)
match = re.search(pattern, text)
if match:
return match.group(1)
else:
return "N/A"
return "N/A"

def title_edge_cases(self, text, label):
# renames documents if incorrect on website
@staticmethod
def title_edge_cases(text: str, label: str) -> str:
"""Renames documents if incorrect on website"""
if "Board Brief; NCO Evaluation Board Supplement" in text:
return label + " Board Brief"
elif "NCO Evaluation Board Supplement" in text:
if "NCO Evaluation Board Supplement" in text:
return label
elif text.endswith(".pdf") or text.endswith("docx"):
if text.endswith(".pdf") or text.endswith("docx"):
return label
else:
pattern = r"(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}"
cleaned_text = re.sub(pattern, "", text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text

def extract_date_from_url(self, url):
pattern = r"(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}"
cleaned_text = re.sub(pattern, "", text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text
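# Illustrative sketch, not part of this commit: the cleanup path above strips a
# leading document number, replaces utf-8 escaped non-breaking spaces, and
# unescapes HTML entities. The input string is a made-up example.
import html
import re
raw = r"DA PAM 600-25 NCO\xc2\xa0Guide &amp; Professional Development"
cleaned = re.sub(r"(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}", "", raw).strip()
cleaned = cleaned.replace("\\xc2\\xa0", " ")
print(html.unescape(cleaned))  # NCO Guide & Professional Development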

@staticmethod
def extract_date_from_url(url: str) -> Union[datetime, str]:
"""Accepts url then parses and returns a datetime object"""
pattern = r"(\d{4}/\d{2}/\d{2})"
match = re.search(pattern, url)
if match:
date = match.group(1)
datetime_ = datetime.strptime(date, "%Y/%m/%d")
return datetime_
else:
return "Unknown"

def parse(self, response):
return "Unknown"

def parse_anchor_tag(
self, link: str, text: str, label_text: str, container_label: str, url: str
) -> Generator[DocItem, Any, None]:
"""Takes in data from anchor tag element and returns the DocItem"""
# only consider links that lead to documents
if link.endswith(".pdf") or link.endswith(".docx"):
# check if title needs to be encoded before conversion to string
if self.is_ascii_encoded(text):
text = str(text.encode("utf-8"))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
# 'pdf' if link.endswith('.pdf'), 'docx' if link.endswith('.docx'), else None
file_type = self.get_href_file_extension(link)

downloadable_items = [
{
"doc_type": file_type,
"download_url": link,
"compression_type": None,
}
]
fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=doc_number,
doc_type="DA PAM",
display_doc_type="DA PAM",
publication_date=publication_date,
cac_login_required=False,
source_page_url=url,
downloadable_items=downloadable_items,
download_url=link,
file_ext=file_type,
)
# maintain backwards compatibility by setting display_title to doc_title in hash fields
fields.set_version_hash_field("display_title", fields.doc_title)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)

def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
"""Parses doc items out of Army G1 Publications site"""
for container in response.css(".inner-container"):
# title of each section
container_label = container.css("h4::text").extract_first()
@@ -96,57 +154,15 @@ def parse(self, response):
soup = BeautifulSoup(item.get(), "html.parser")
div_tag = soup.find("div", class_="rich-text-element bodytext")

if div_tag:
# Find all anchor tags
anchor_tags = soup.find_all("a")

# Extract URLs and text
for tag in anchor_tags:
link = tag["href"]
text = tag.get_text()

# only consider links that lead to documents
if link.endswith(".pdf") or link.endswith(".docx"):
# check if title needs to be encoded before conversion to string
if self.encoding(text):
text = str(text.encode("utf-8"))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
# file_type = 'pdf' if link.endswith('.pdf') else ('docx' if link.endswith('.docx') else None)
file_type = self.get_href_file_extension(link)

downloadable_items = [
{
"doc_type": file_type,
"download_url": link,
"compression_type": None,
}
]
fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=doc_number,
doc_type="DA PAM",
display_doc_type="DA PAM",
publication_date=publication_date,
cac_login_required=False,
source_page_url=response.url,
downloadable_items=downloadable_items,
download_url=link,
file_ext=file_type,
)
# maintain backwards compatibility by setting display_title to doc_title in hash fields
fields.set_version_hash_field(
"display_title", fields.doc_title
)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)
if div_tag is None:
continue
# Find all anchor tags
anchor_tags = soup.find_all("a")

# Extract URLs and text
for tag in anchor_tags:
link = tag["href"]
text = tag.get_text()
yield from self.parse_anchor_tag(
link, text, label_text, container_label, response.url
)
