New crawler: DISA pubs (#226)
* initialize new DISA pubs crawler

* add arguments to runCrawler.sh file

* refactor DISA crawler, add description

* separate site-specific logic from Item logic

* add docstrings

* add PR template and trigger workflow

* add new crawler to tuesday schedule

* add helpful information to bash script

* update display title

* update Dockerfile to pin Ubuntu version and set tzdata values

---------

Co-authored-by: Matthew Kersting <[email protected]>
Co-authored-by: Ant_sega <[email protected]>
3 people committed Apr 30, 2024
1 parent 85337b8 commit 5a43724
Showing 7 changed files with 258 additions and 8 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/pull_request_template.md
@@ -0,0 +1,14 @@
## Description


## Result of Crawler Run on Dev
```yaml

```

## Example Metadata
```javascript
{

}
```
2 changes: 1 addition & 1 deletion .github/workflows/verify-spiders-scheduled.yml
@@ -8,7 +8,7 @@ on:
  push:
    branches: [ dev ]
  pull_request:
-    branches: [ dev ]
+    branches: [ dev, main ]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
10 changes: 9 additions & 1 deletion Dockerfile
@@ -1,4 +1,12 @@
-FROM --platform=linux/amd64 ubuntu:latest
+FROM --platform=linux/amd64 ubuntu:20.04

# Set timezone for tzdata
ENV TZ=UTC

# Install tzdata non-interactively
RUN ln -fs /usr/share/zoneinfo/$TZ /etc/localtime && \
    apt-get update && \
    apt-get install -y tzdata

# Update and install necessary packages
RUN apt-get update && apt-get upgrade -y ca-certificates && \
94 changes: 94 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -0,0 +1,94 @@
from urllib.parse import urlparse
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class DocItemFields:
    """Designed to store all fields necessary to generate DocItems"""

    def __init__(
        self,
        doc_name: str,
        doc_title: str,
        doc_num: str,
        doc_type: str,
        publication_date: datetime,
        cac_login_required: bool,
        source_page_url: str,
        downloadable_items: dict,
        download_url: str,
        file_ext: str,
    ):
        self.doc_name = doc_name
        self.doc_title = doc_title
        self.doc_num = doc_num
        self.doc_type = doc_type
        self.display_doc_type = doc_type
        self.publication_date = publication_date.strftime("%Y-%m-%dT%H:%M:%S")
        self.cac_login_required = cac_login_required
        self.source_page_url = source_page_url
        self.downloadable_items = downloadable_items
        self.download_url = download_url
        self.file_ext = file_ext
        self.display_title = doc_type + " " + doc_num + ": " + doc_title

    def get_version_hash_fields(self) -> dict:
        """Returns a dict of the fields used for hashing"""
        return {
            "doc_name": self.doc_name,
            "doc_num": self.doc_num,
            "publication_date": self.publication_date,
            "download_url": self.download_url,
            "display_title": self.doc_title,
        }

    def set_display_name(self, name: str) -> None:
        """Update the display name for this DocItemFields instance"""
        self.display_title = name

    def populate_doc_item(
        self, display_org: str, data_source: str, source_title: str, crawler_used: str
    ) -> DocItem:
        """Takes the data stored in the current object, then populates and returns a scrapy DocItem

        Args:
            display_org (str): Level 1 - GC app 'Source' filter for docs from this crawler
            data_source (str): Level 2 - GC app 'Source' metadata field for docs from this crawler
            source_title (str): Level 3 - filter
            crawler_used (str): name of crawler used

        Returns:
            DocItem: scrapy.Item subclass for storing Documents in GC
        """

        display_source = data_source + " - " + source_title
        is_revoked = False
        source_fqdn = urlparse(self.source_page_url).netloc
        version_hash_fields = self.get_version_hash_fields()
        version_hash = dict_to_sha256_hex_digest(version_hash_fields)

        return DocItem(
            doc_name=self.doc_name,
            doc_title=self.doc_title,
            doc_num=self.doc_num,
            doc_type=self.doc_type,
            display_doc_type=self.display_doc_type,
            publication_date=self.publication_date,
            cac_login_required=self.cac_login_required,
            crawler_used=crawler_used,
            downloadable_items=self.downloadable_items,
            source_page_url=self.source_page_url,
            source_fqdn=source_fqdn,
            download_url=self.download_url,
            version_hash_raw_data=version_hash_fields,
            version_hash=version_hash,
            display_org=display_org,
            data_source=data_source,
            source_title=source_title,
            display_source=display_source,
            display_title=self.display_title,
            file_ext=self.file_ext,
            is_revoked=is_revoked,
        )
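For reference, a minimal sketch of how `DocItemFields` is meant to be used; the document name, title, and URLs below are hypothetical placeholders (the real values come from the spider added in this PR):

```python
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

# Hypothetical values for illustration only; a spider would scrape these.
fields = DocItemFields(
    doc_name="DISAI 100-50",
    doc_title="Example Instruction Title",
    doc_num="100-50",
    doc_type="Instruction",
    publication_date=datetime(2024, 4, 26),
    cac_login_required=False,
    source_page_url="https://disa.mil/About/DISA-Issuances/Instructions",
    downloadable_items=[
        {"doc_type": "pdf", "download_url": "https://disa.mil/example.pdf", "compression_type": None}
    ],
    download_url="https://disa.mil/example.pdf",
    file_ext="pdf",
)

# Optionally override the default "<type> <num>: <title>" display title.
fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")

# populate_doc_item hashes the version fields and returns a scrapy DocItem.
doc_item = fields.populate_doc_item(
    display_org="Defense Information Systems Agency",
    data_source="Defense Information Systems Agency",
    source_title="DISA Policy/Issuances",
    crawler_used="DISA_pubs",
)
```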
93 changes: 93 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/disa_pubs_spider.py
@@ -0,0 +1,93 @@
from typing import Any, Generator
from urllib.parse import urljoin
from datetime import datetime
import bs4
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider


class DisaPubsSpider(GCSpider):
    """
    As of 04/26/2024
    crawls https://disa.mil/About/DISA-Issuances/Instructions for 42 pdfs (doc_type = Instruction)
    and https://disa.mil/About/DISA-Issuances/Circulars for 6 pdfs (doc_type = Circulars)
    """

    # Crawler name
    name = "DISA_pubs"
    # Level 1: GC app 'Source' filter for docs from this crawler
    display_org = "Defense Information Systems Agency"
    # Level 2: GC app 'Source' metadata field for docs from this crawler
    data_source = "Defense Information Systems Agency"
    # Level 3 filter
    source_title = "DISA Policy/Issuances"

    domain = "disa.mil"
    base_url = f"https://{domain}"
    allowed_domains = [domain]
    start_urls = [
        urljoin(base_url, "/About/DISA-Issuances/Instructions"),
        urljoin(base_url, "/About/DISA-Issuances/Circulars"),
    ]

    rotate_user_agent = True
    date_format = "%m/%d/%y"

    def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
        """Parses doc items out of DISA Policy/Issuances site"""
        page_url = response.url
        soup = bs4.BeautifulSoup(response.body, features="html.parser")

        for row in soup.find(id="main-content").find_all("tr"):
            row_items = row.find_all("td")

            # Ensure elements are present and skip the header row
            if len(row_items) != 3:
                continue

            link_cell, title_cell, publication_cell = row_items

            url = urljoin(self.base_url, link_cell.find("a").get("href"))
            doc_name = self.ascii_clean(link_cell.find("a").get_text().strip())

            pdf_di = [
                {"doc_type": "pdf", "download_url": url, "compression_type": None}
            ]

            fields = DocItemFields(
                doc_name=doc_name,
                doc_title=self.ascii_clean(title_cell.get_text().strip()),
                doc_num=doc_name.split(" ")[-1],
                doc_type=self.get_doc_type(doc_name),
                publication_date=self.extract_date(publication_cell.get_text()),
                cac_login_required=False,
                source_page_url=page_url,
                downloadable_items=pdf_di,
                download_url=url,
                file_ext="pdf",
            )
            fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")

            yield fields.populate_doc_item(
                display_org=self.display_org,
                data_source=self.data_source,
                source_title=self.source_title,
                crawler_used=self.name,
            )

    def extract_date(self, input_date: str) -> datetime:
        """Takes in dates formatted as 03/17/17 or 04/15/ 13 and returns datetime object"""
        published = input_date.strip().replace(" ", "")
        published_timestamp = datetime.strptime(published, self.date_format)
        return published_timestamp

    def get_doc_type(self, doc_name: str) -> str:
        """Takes in the doc name and returns the type, only handles DISAC and DISAI docs"""
        if "DISAC" in doc_name:
            return "Circular"
        if "DISAI" in doc_name:
            return "Instruction"
        raise ValueError(f"Unexpected value for doc_name {doc_name}")
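A quick sketch of the two helper methods in isolation, assuming a `GCSpider` subclass can be instantiated with no arguments; the document names below are hypothetical:

```python
from dataPipelines.gc_scrapy.gc_scrapy.spiders.disa_pubs_spider import DisaPubsSpider

spider = DisaPubsSpider()

# extract_date strips whitespace, so "04/15/ 13" (as seen on the site) parses cleanly.
print(spider.extract_date("04/15/ 13"))  # 2013-04-15 00:00:00
print(spider.extract_date("03/17/17"))   # 2017-03-17 00:00:00

# get_doc_type keys off the DISAC/DISAI prefix in the document name.
print(spider.get_doc_type("DISAC 300-110-3"))  # Circular
print(spider.get_doc_type("DISAI 100-50"))     # Instruction
```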
3 changes: 2 additions & 1 deletion paasJobs/crawler_schedule/tuesday.txt
@@ -2,4 +2,5 @@ army_pubs_spider.py
cfr_spider.py
dha_spider
cnss_spider
-us_code_spider
+us_code_spider
+disa_pubs_spider.py
50 changes: 45 additions & 5 deletions runCrawler.sh
@@ -1,10 +1,50 @@
#!/bin/bash

print_help () {
    echo "Usage: ./runCrawler.sh -c={crawler name} --reset {optional: resets the data directory before running}"
    echo "Example: ./runCrawler.sh -c=disa_pubs_spider --reset"
    exit 1
}

RESET=false
CRAWLER=""
for i in "$@"; do
    case $i in
        -c=*|--crawler=*)
            CRAWLER="${i#*=}"
            shift # past argument=value
            ;;
        --reset)
            RESET=true
            shift # past argument with no value
            ;;
        -*|--*)
            print_help
            ;;
        *)
            ;;
    esac
done

if [ "$CRAWLER" == "" ]; then
    echo "ERROR: Please use the -c option to specify a crawler"
    print_help
fi

export PYTHONPATH="$(pwd)"
-CRAWLER_DATA_ROOT=./tmp
+CRAWLER_DATA_ROOT=./tmp/$CRAWLER
mkdir -p "$CRAWLER_DATA_ROOT"

echo "CRAWLER = ${CRAWLER}"
echo "RESET = ${RESET}"
echo "DATA_ROOT = ${CRAWLER_DATA_ROOT}"

if $RESET; then
    rm $CRAWLER_DATA_ROOT/*
fi

touch "$CRAWLER_DATA_ROOT/prev-manifest.json"
-scrapy runspider dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py \
-    -a download_output_dir="$CRAWLER_DATA_ROOT" \
-    -a previous_manifest_location="$CRAWLER_DATA_ROOT/prev-manifest.json" \
-    -o "$CRAWLER_DATA_ROOT/output.json"
+scrapy runspider dataPipelines/gc_scrapy/gc_scrapy/spiders/$CRAWLER.py \
+    -a download_output_dir="$CRAWLER_DATA_ROOT" \
+    -a previous_manifest_location="$CRAWLER_DATA_ROOT/prev-manifest.json" \
+    -o "$CRAWLER_DATA_ROOT/output.json"
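For local debugging outside the shell script, roughly the same run can be reproduced from Python with Scrapy's `CrawlerProcess`. This is a sketch, not part of the commit: the paths mirror the script's defaults, and it assumes the `-a` flags map to spider keyword arguments (Scrapy's standard behavior):

```python
import os

from scrapy.crawler import CrawlerProcess

from dataPipelines.gc_scrapy.gc_scrapy.spiders.disa_pubs_spider import DisaPubsSpider

data_root = "./tmp/disa_pubs_spider"  # mirrors CRAWLER_DATA_ROOT in runCrawler.sh
os.makedirs(data_root, exist_ok=True)

# runCrawler.sh touches the previous-manifest file before running, so do the same here.
open(f"{data_root}/prev-manifest.json", "a").close()

process = CrawlerProcess(settings={
    # Equivalent of the -o flag: write scraped items to output.json
    "FEEDS": {f"{data_root}/output.json": {"format": "json"}},
})

# Equivalent of the -a flags: keyword arguments become spider attributes.
process.crawl(
    DisaPubsSpider,
    download_output_dir=data_root,
    previous_manifest_location=f"{data_root}/prev-manifest.json",
)
process.start()
```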
