Commit
* initialize new DISA pubs crawler
* add arguments to runCrawler.sh file
* refactor DISA crawler, add description
* separate site-specific logic from Item logic
* add doc strings
* add PR template and trigger workflow
* add new crawler to Tuesday schedule
* add helpful information to bash script
* update display title
* updated Dockerfile to set Ubuntu version and tzdata values

Co-authored-by: Matthew Kersting <[email protected]>
Co-authored-by: Ant_sega <[email protected]>
1 parent 85337b8 · commit 5a43724
Showing 7 changed files with 258 additions and 8 deletions.
@@ -0,0 +1,14 @@
## Description

## Result of Crawler Run on Dev
```yaml

```

## Example Metadata
```javascript
{

}
```
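For context, the "Example Metadata" block in this template is meant to hold one document's metadata from a crawler run. A hypothetical entry for this crawler, using the field names that populate_doc_item in doc_item_fields.py emits (every value below is illustrative, not real output, shown as a Python dict rather than JavaScript):

```python
# Hypothetical DocItem metadata for one DISA publication; all values are made up.
{
    "doc_name": "DISAI 100-50",
    "doc_title": "Example Instruction Title",
    "doc_num": "100-50",
    "doc_type": "Instruction",
    "display_doc_type": "Instruction",
    "display_title": "DISAI 100-50: Example Instruction Title",
    "publication_date": "2024-04-26T00:00:00",
    "cac_login_required": False,
    "crawler_used": "DISA_pubs",
    "source_page_url": "https://disa.mil/About/DISA-Issuances/Instructions",
    "source_fqdn": "disa.mil",
    "download_url": "https://disa.mil/example.pdf",
    "downloadable_items": [
        {"doc_type": "pdf", "download_url": "https://disa.mil/example.pdf", "compression_type": None}
    ],
    "file_ext": "pdf",
    "is_revoked": False,
}
```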
dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -0,0 +1,94 @@
from urllib.parse import urlparse
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class DocItemFields:
    """Designed to store all fields necessary to generate DocItems"""

    def __init__(
        self,
        doc_name: str,
        doc_title: str,
        doc_num: str,
        doc_type: str,
        publication_date: datetime,
        cac_login_required: bool,
        source_page_url: str,
        downloadable_items: dict,
        download_url: str,
        file_ext: str,
    ):
        self.doc_name = doc_name
        self.doc_title = doc_title
        self.doc_num = doc_num
        self.doc_type = doc_type
        self.display_doc_type = doc_type
        self.publication_date = publication_date.strftime("%Y-%m-%dT%H:%M:%S")
        self.cac_login_required = cac_login_required
        self.source_page_url = source_page_url
        self.downloadable_items = downloadable_items
        self.download_url = download_url
        self.file_ext = file_ext
        self.display_title = doc_type + " " + doc_num + ": " + doc_title

    def get_version_hash_fields(self) -> dict:
        """Returns a dict of the fields used for hashing"""
        return {
            "doc_name": self.doc_name,
            "doc_num": self.doc_num,
            "publication_date": self.publication_date,
            "download_url": self.download_url,
            "display_title": self.doc_title,
        }

    def set_display_name(self, name: str) -> None:
        """Update display name for DocItemFields instance"""
        self.display_title = name

    def populate_doc_item(
        self, display_org: str, data_source: str, source_title: str, crawler_used: str
    ) -> DocItem:
        """Takes the data stored in the current object, populates, then returns a scrapy DocItem

        Args:
            display_org (str): Level 1 - GC app 'Source' filter for docs from this crawler
            data_source (str): Level 2 - GC app 'Source' metadata field for docs from this crawler
            source_title (str): Level 3 - filter
            crawler_used (str): name of crawler used

        Returns:
            DocItem: scrapy.Item subclass for storing Documents in GC
        """

        display_source = data_source + " - " + source_title
        is_revoked = False
        source_fqdn = urlparse(self.source_page_url).netloc
        version_hash_fields = self.get_version_hash_fields()
        version_hash = dict_to_sha256_hex_digest(version_hash_fields)

        return DocItem(
            doc_name=self.doc_name,
            doc_title=self.doc_title,
            doc_num=self.doc_num,
            doc_type=self.doc_type,
            display_doc_type=self.display_doc_type,
            publication_date=self.publication_date,
            cac_login_required=self.cac_login_required,
            crawler_used=crawler_used,
            downloadable_items=self.downloadable_items,
            source_page_url=self.source_page_url,
            source_fqdn=source_fqdn,
            download_url=self.download_url,
            version_hash_raw_data=version_hash_fields,
            version_hash=version_hash,
            display_org=display_org,
            data_source=data_source,
            source_title=source_title,
            display_source=display_source,
            display_title=self.display_title,
            file_ext=self.file_ext,
            is_revoked=is_revoked,
        )
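A minimal usage sketch of the class above (hypothetical values; the real call site is the DISA spider in the next file):

```python
# Sketch only: construct DocItemFields with made-up values and emit a DocItem.
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

fields = DocItemFields(
    doc_name="DISAI 100-50",  # hypothetical document name
    doc_title="Example Instruction Title",
    doc_num="100-50",
    doc_type="Instruction",
    publication_date=datetime(2024, 4, 26),
    cac_login_required=False,
    source_page_url="https://disa.mil/About/DISA-Issuances/Instructions",
    downloadable_items=[
        {"doc_type": "pdf", "download_url": "https://disa.mil/example.pdf", "compression_type": None}
    ],
    download_url="https://disa.mil/example.pdf",
    file_ext="pdf",
)
fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")
doc_item = fields.populate_doc_item(
    display_org="Defense Information Systems Agency",
    data_source="Defense Information Systems Agency",
    source_title="DISA Policy/Issuances",
    crawler_used="DISA_pubs",
)
```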
93 changes: 93 additions & 0 deletions
dataPipelines/gc_scrapy/gc_scrapy/spiders/disa_pubs_spider.py
@@ -0,0 +1,93 @@
from typing import Any, Generator
from urllib.parse import urljoin
from datetime import datetime
import bs4
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider


class DisaPubsSpider(GCSpider):
    """
    As of 04/26/2024
    crawls https://disa.mil/About/DISA-Issuances/Instructions for 42 pdfs (doc_type = Instruction)
    and https://disa.mil/About/DISA-Issuances/Circulars for 6 pdfs (doc_type = Circular)
    """

    # Crawler name
    name = "DISA_pubs"
    # Level 1: GC app 'Source' filter for docs from this crawler
    display_org = "Defense Information Systems Agency"
    # Level 2: GC app 'Source' metadata field for docs from this crawler
    data_source = "Defense Information Systems Agency"
    # Level 3 filter
    source_title = "DISA Policy/Issuances"

    domain = "disa.mil"
    base_url = f"https://{domain}"
    allowed_domains = [domain]
    start_urls = [
        urljoin(base_url, "/About/DISA-Issuances/Instructions"),
        urljoin(base_url, "/About/DISA-Issuances/Circulars"),
    ]

    rotate_user_agent = True
    date_format = "%m/%d/%y"

    def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
        """Parses doc items out of DISA Policy/Issuances site"""
        page_url = response.url
        soup = bs4.BeautifulSoup(response.body, features="html.parser")

        for row in soup.find(id="main-content").find_all("tr"):
            row_items = row.find_all("td")

            # Ensure elements are present and skip the header row
            if len(row_items) != 3:
                continue

            link_cell, title_cell, publication_cell = row_items

            url = urljoin(self.base_url, link_cell.find("a").get("href"))
            doc_name = self.ascii_clean(link_cell.find("a").get_text().strip())

            pdf_di = [
                {"doc_type": "pdf", "download_url": url, "compression_type": None}
            ]

            fields = DocItemFields(
                doc_name=doc_name,
                doc_title=self.ascii_clean(title_cell.get_text().strip()),
                doc_num=doc_name.split(" ")[-1],
                doc_type=self.get_doc_type(doc_name),
                publication_date=self.extract_date(publication_cell.get_text()),
                cac_login_required=False,
                source_page_url=page_url,
                downloadable_items=pdf_di,
                download_url=url,
                file_ext="pdf",
            )
            fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")

            yield fields.populate_doc_item(
                display_org=self.display_org,
                data_source=self.data_source,
                source_title=self.source_title,
                crawler_used=self.name,
            )

    def extract_date(self, input_date: str) -> datetime:
        """Takes in dates formatted as 03/17/17 or 04/15/ 13 and returns a datetime object"""
        published = input_date.strip().replace(" ", "")
        published_timestamp = datetime.strptime(published, self.date_format)
        return published_timestamp

    def get_doc_type(self, doc_name: str) -> str:
        """Takes in the doc name and returns the type; only handles DISAC and DISAI docs"""
        if "DISAC" in doc_name:
            return "Circular"
        if "DISAI" in doc_name:
            return "Instruction"
        raise ValueError(f"Unexpected value for doc_name {doc_name}")
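As a quick illustration of the two helper methods above, here is the same logic as standalone functions (a sketch that runs without scrapy; the sample doc names are hypothetical):

```python
# Standalone sketch of the spider's date parsing and doc-type logic.
from datetime import datetime

DATE_FORMAT = "%m/%d/%y"

def extract_date(input_date: str) -> datetime:
    # "04/15/ 13" -> "04/15/13" -> datetime(2013, 4, 15)
    return datetime.strptime(input_date.strip().replace(" ", ""), DATE_FORMAT)

def get_doc_type(doc_name: str) -> str:
    # Only DISAC (Circular) and DISAI (Instruction) names are expected.
    if "DISAC" in doc_name:
        return "Circular"
    if "DISAI" in doc_name:
        return "Instruction"
    raise ValueError(f"Unexpected value for doc_name {doc_name}")

assert extract_date("03/17/17") == datetime(2017, 3, 17)
assert extract_date("04/15/ 13") == datetime(2013, 4, 15)
assert get_doc_type("DISAI 100-50") == "Instruction"   # hypothetical name
assert get_doc_type("DISAC 300-110-3") == "Circular"   # hypothetical name
```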
@@ -2,4 +2,5 @@ army_pubs_spider.py
cfr_spider.py
dha_spider
cnss_spider
us_code_spider
disa_pubs_spider.py
runCrawler.sh
@@ -1,10 +1,50 @@
#!/bin/bash

print_help () {
    echo "Usage: ./runCrawler.sh -c={crawler name} --reset {optional: resets the data directory before running}"
    echo "Example: ./runCrawler.sh -c=disa_pubs_spider --reset"
    exit 1
}

RESET=false
CRAWLER=""
for i in "$@"; do
    case $i in
        -c=*|--crawler=*)
            CRAWLER="${i#*=}"
            shift # past argument=value
            ;;
        --reset)
            RESET=true
            shift # past argument with no value
            ;;
        -*|--*)
            print_help
            ;;
        *)
            ;;
    esac
done

if [ "$CRAWLER" == "" ]; then
    echo "ERROR: Please use the -c option to specify a crawler"
    print_help
fi

export PYTHONPATH="$(pwd)"
CRAWLER_DATA_ROOT=./tmp/$CRAWLER
mkdir -p "$CRAWLER_DATA_ROOT"

echo "CRAWLER = ${CRAWLER}"
echo "RESET = ${RESET}"
echo "DATA_ROOT = ${CRAWLER_DATA_ROOT}"

if $RESET; then
    rm "$CRAWLER_DATA_ROOT"/*
fi

touch "$CRAWLER_DATA_ROOT/prev-manifest.json"
scrapy runspider dataPipelines/gc_scrapy/gc_scrapy/spiders/$CRAWLER.py \
    -a download_output_dir="$CRAWLER_DATA_ROOT" \
    -a previous_manifest_location="$CRAWLER_DATA_ROOT/prev-manifest.json" \
    -o "$CRAWLER_DATA_ROOT/output.json"