New crawler disa pubs #226

Merged: 10 commits, Apr 30, 2024
14 changes: 14 additions & 0 deletions .github/workflows/pull_request_template.md
@@ -0,0 +1,14 @@
## Description


## Result of Crawler Run on Dev
```yaml

```

## Example Metadata
```javascript
{

}
```
2 changes: 1 addition & 1 deletion .github/workflows/verify-spiders-scheduled.yml
@@ -8,7 +8,7 @@ on:
   push:
     branches: [ dev ]
   pull_request:
-    branches: [ dev ]
+    branches: [ dev, main ]
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
10 changes: 9 additions & 1 deletion Dockerfile
@@ -1,4 +1,12 @@
-FROM --platform=linux/amd64 ubuntu:latest
+FROM --platform=linux/amd64 ubuntu:20.04
 
+# Set timezone for tzdata
+ENV TZ=UTC
+
+# Install tzdata non-interactively
+RUN ln -fs /usr/share/zoneinfo/$TZ /etc/localtime && \
+    apt-get update && \
+    apt-get install -y tzdata
+
 # Update and install necessary packages
 RUN apt-get update && apt-get upgrade -y ca-certificates && \
94 changes: 94 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -0,0 +1,94 @@
from urllib.parse import urlparse
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class DocItemFields:
    """Designed to store all fields necessary to generate DocItems"""

    def __init__(
        self,
        doc_name: str,
        doc_title: str,
        doc_num: str,
        doc_type: str,
        publication_date: datetime,
        cac_login_required: bool,
        source_page_url: str,
        downloadable_items: list,
        download_url: str,
        file_ext: str,
    ):
        self.doc_name = doc_name
        self.doc_title = doc_title
        self.doc_num = doc_num
        self.doc_type = doc_type
        self.display_doc_type = doc_type
        self.publication_date = publication_date.strftime("%Y-%m-%dT%H:%M:%S")
        self.cac_login_required = cac_login_required
        self.source_page_url = source_page_url
        self.downloadable_items = downloadable_items
        self.download_url = download_url
        self.file_ext = file_ext
        self.display_title = doc_type + " " + doc_num + ": " + doc_title

    def get_version_hash_fields(self) -> dict:
        """Returns a dict of the fields used for hashing"""
        return {
            "doc_name": self.doc_name,
            "doc_num": self.doc_num,
            "publication_date": self.publication_date,
            "download_url": self.download_url,
            "display_title": self.doc_title,
        }

    def set_display_name(self, name: str) -> None:
        """Update the display name for this DocItemFields instance"""
        self.display_title = name

    def populate_doc_item(
        self, display_org: str, data_source: str, source_title: str, crawler_used: str
    ) -> DocItem:
        """Takes the data stored in the current object, populates, and returns a scrapy DocItem

        Args:
            display_org (str): Level 1 - GC app 'Source' filter for docs from this crawler
            data_source (str): Level 2 - GC app 'Source' metadata field for docs from this crawler
            source_title (str): Level 3 - filter
            crawler_used (str): name of crawler used

        Returns:
            DocItem: scrapy.Item subclass for storing documents in GC
        """

        display_source = data_source + " - " + source_title
        is_revoked = False
        source_fqdn = urlparse(self.source_page_url).netloc
        version_hash_fields = self.get_version_hash_fields()
        version_hash = dict_to_sha256_hex_digest(version_hash_fields)

        return DocItem(
            doc_name=self.doc_name,
            doc_title=self.doc_title,
            doc_num=self.doc_num,
            doc_type=self.doc_type,
            display_doc_type=self.display_doc_type,
            publication_date=self.publication_date,
            cac_login_required=self.cac_login_required,
            crawler_used=crawler_used,
            downloadable_items=self.downloadable_items,
            source_page_url=self.source_page_url,
            source_fqdn=source_fqdn,
            download_url=self.download_url,
            version_hash_raw_data=version_hash_fields,
            version_hash=version_hash,
            display_org=display_org,
            data_source=data_source,
            source_title=source_title,
            display_source=display_source,
            display_title=self.display_title,
            file_ext=self.file_ext,
            is_revoked=is_revoked,
        )
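
For reference, a minimal sketch of how a spider is expected to use this helper, mirroring the DISA spider added later in this PR. The document values below are illustrative placeholders, not real scraped data:

```python
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

# Placeholder values for illustration only; a real spider fills these from the scraped page.
fields = DocItemFields(
    doc_name="DISAI 100-50",
    doc_title="Example Instruction Title",
    doc_num="100-50",
    doc_type="Instruction",
    publication_date=datetime(2017, 3, 17),
    cac_login_required=False,
    source_page_url="https://disa.mil/About/DISA-Issuances/Instructions",
    downloadable_items=[
        {"doc_type": "pdf", "download_url": "https://disa.mil/example.pdf", "compression_type": None}
    ],
    download_url="https://disa.mil/example.pdf",
    file_ext="pdf",
)
fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")

# Convert the collected fields into a scrapy DocItem for the GC pipeline.
doc_item = fields.populate_doc_item(
    display_org="Defense Information Systems Agency",
    data_source="Defense Information Systems Agency",
    source_title="DISA Policy/Issuances",
    crawler_used="DISA_pubs",
)
```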
93 changes: 93 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/disa_pubs_spider.py
@@ -0,0 +1,93 @@
from typing import Any, Generator
from urllib.parse import urljoin
from datetime import datetime
import bs4
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider


class DisaPubsSpider(GCSpider):
    """
    As of 04/26/2024
    crawls https://disa.mil/About/DISA-Issuances/Instructions for 42 pdfs (doc_type = Instruction)
    and https://disa.mil/About/DISA-Issuances/Circulars for 6 pdfs (doc_type = Circulars)
    """

    # Crawler name
    name = "DISA_pubs"
    # Level 1: GC app 'Source' filter for docs from this crawler
    display_org = "Defense Information Systems Agency"
    # Level 2: GC app 'Source' metadata field for docs from this crawler
    data_source = "Defense Information Systems Agency"
    # Level 3 filter
    source_title = "DISA Policy/Issuances"

    domain = "disa.mil"
    base_url = f"https://{domain}"
    allowed_domains = [domain]
    start_urls = [
        urljoin(base_url, "/About/DISA-Issuances/Instructions"),
        urljoin(base_url, "/About/DISA-Issuances/Circulars"),
    ]

    rotate_user_agent = True
    date_format = "%m/%d/%y"

    def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
        """Parses doc items out of the DISA Policy/Issuances site"""
        page_url = response.url
        soup = bs4.BeautifulSoup(response.body, features="html.parser")

        for row in soup.find(id="main-content").find_all("tr"):
            row_items = row.find_all("td")

            # Ensure all three cells are present and skip the header row
            if len(row_items) != 3:
                continue

            link_cell, title_cell, publication_cell = row_items

            url = urljoin(self.base_url, link_cell.find("a").get("href"))
            doc_name = self.ascii_clean(link_cell.find("a").get_text().strip())

            pdf_di = [
                {"doc_type": "pdf", "download_url": url, "compression_type": None}
            ]

            fields = DocItemFields(
                doc_name=doc_name,
                doc_title=self.ascii_clean(title_cell.get_text().strip()),
                doc_num=doc_name.split(" ")[-1],
                doc_type=self.get_doc_type(doc_name),
                publication_date=self.extract_date(publication_cell.get_text()),
                cac_login_required=False,
                source_page_url=page_url,
                downloadable_items=pdf_di,
                download_url=url,
                file_ext="pdf",
            )
            fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")

            yield fields.populate_doc_item(
                display_org=self.display_org,
                data_source=self.data_source,
                source_title=self.source_title,
                crawler_used=self.name,
            )

    def extract_date(self, input_date: str) -> datetime:
        """Takes in dates formatted as 03/17/17 or 04/15/ 13 and returns a datetime object"""
        published = input_date.strip().replace(" ", "")
        published_timestamp = datetime.strptime(published, self.date_format)
        return published_timestamp

    def get_doc_type(self, doc_name: str) -> str:
        """Takes in the doc name and returns the doc type; only handles DISAC and DISAI docs"""
        if "DISAC" in doc_name:
            return "Circular"
        if "DISAI" in doc_name:
            return "Instruction"
        raise ValueError(f"Unexpected value for doc_name {doc_name}")
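
To make the two helpers concrete, here is a short standalone sketch; the date strings are the formats named in the docstring (the second mimics the stray space that appears on the live site), and the behavior notes are taken from the code above:

```python
from datetime import datetime

date_format = "%m/%d/%y"

# extract_date: strip whitespace, drop any stray internal space, then parse MM/DD/YY.
for raw in ("03/17/17", "04/15/ 13"):
    cleaned = raw.strip().replace(" ", "")
    print(datetime.strptime(cleaned, date_format))
# -> 2017-03-17 00:00:00
# -> 2013-04-15 00:00:00

# get_doc_type: names containing "DISAC" map to "Circular", names containing
# "DISAI" map to "Instruction", and anything else raises a ValueError.
```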
3 changes: 2 additions & 1 deletion paasJobs/crawler_schedule/tuesday.txt
@@ -2,4 +2,5 @@ army_pubs_spider.py
 cfr_spider.py
 dha_spider
 cnss_spider
-us_code_spider
+us_code_spider
+disa_pubs_spider.py
50 changes: 45 additions & 5 deletions runCrawler.sh
@@ -1,10 +1,50 @@
 #!/bin/bash
 
+print_help () {
+    echo "Usage: ./runCrawler.sh -c={crawler name} --reset {optional: resets the data directory before running}"
+    echo "Example: ./runCrawler.sh -c=disa_pubs_spider --reset"
+    exit 1
+}
+
+RESET=false
+CRAWLER=""
+for i in "$@"; do
+    case $i in
+        -c=*|--crawler=*)
+            CRAWLER="${i#*=}"
+            shift # past argument=value
+            ;;
+        --reset)
+            RESET=true
+            shift # past argument with no value
+            ;;
+        -*|--*)
+            print_help
+            ;;
+        *)
+            ;;
+    esac
+done
+
+if [ "$CRAWLER" == "" ]; then
+    echo "ERROR: Please use the -c option to specify a crawler"
+    print_help
+fi
+
 export PYTHONPATH="$(pwd)"
-CRAWLER_DATA_ROOT=./tmp
+CRAWLER_DATA_ROOT=./tmp/$CRAWLER
 mkdir -p "$CRAWLER_DATA_ROOT"
+
+echo "CRAWLER = ${CRAWLER}"
+echo "RESET = ${RESET}"
+echo "DATA_ROOT = ${CRAWLER_DATA_ROOT}"
+
+if $RESET; then
+    rm "$CRAWLER_DATA_ROOT"/*
+fi
+
 touch "$CRAWLER_DATA_ROOT/prev-manifest.json"
-scrapy runspider dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py \
-    -a download_output_dir="$CRAWLER_DATA_ROOT" \
-    -a previous_manifest_location="$CRAWLER_DATA_ROOT/prev-manifest.json" \
-    -o "$CRAWLER_DATA_ROOT/output.json"
+scrapy runspider dataPipelines/gc_scrapy/gc_scrapy/spiders/$CRAWLER.py \
+    -a download_output_dir="$CRAWLER_DATA_ROOT" \
+    -a previous_manifest_location="$CRAWLER_DATA_ROOT/prev-manifest.json" \
+    -o "$CRAWLER_DATA_ROOT/output.json"