New crawler: DISA pubs (#226)
* initialize new DISA pubs crawler

* add arguments to runCrawler.sh file

* refactor DISA crawler, add description

* separate site-specific logic from Item logic

* add docstrings

* add PR template and trigger workflow

* add new crawler to tuesday schedule

* add helpful information to bash script

* update display title

* update Dockerfile to pin Ubuntu version and set tzdata values

---------

Co-authored-by: Matthew Kersting <[email protected]>
Co-authored-by: Ant_sega <[email protected]>
3 people committed Apr 30, 2024
1 parent 85337b8 commit 5a43724
Showing 7 changed files with 258 additions and 8 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/pull_request_template.md
@@ -0,0 +1,14 @@
## Description


## Result of Crawler Run on Dev
```yaml

```

## Example Metadata
```javascript
{

}
```
2 changes: 1 addition & 1 deletion .github/workflows/verify-spiders-scheduled.yml
@@ -8,7 +8,7 @@ on:
  push:
    branches: [ dev ]
  pull_request:
-    branches: [ dev ]
+    branches: [ dev, main ]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
10 changes: 9 additions & 1 deletion Dockerfile
@@ -1,4 +1,12 @@
-FROM --platform=linux/amd64 ubuntu:latest
+FROM --platform=linux/amd64 ubuntu:20.04

# Set timezone for tzdata
ENV TZ=UTC

# Install tzdata non-interactively
RUN ln -fs /usr/share/zoneinfo/$TZ /etc/localtime && \
    apt-get update && \
    apt-get install -y tzdata

# Update and install necessary packages
RUN apt-get update && apt-get upgrade -y ca-certificates && \
94 changes: 94 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -0,0 +1,94 @@
from urllib.parse import urlparse
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class DocItemFields:
    """Designed to store all fields necessary to generate DocItems"""

    def __init__(
        self,
        doc_name: str,
        doc_title: str,
        doc_num: str,
        doc_type: str,
        publication_date: datetime,
        cac_login_required: bool,
        source_page_url: str,
        downloadable_items: dict,
        download_url: str,
        file_ext: str,
    ):
        self.doc_name = doc_name
        self.doc_title = doc_title
        self.doc_num = doc_num
        self.doc_type = doc_type
        self.display_doc_type = doc_type
        self.publication_date = publication_date.strftime("%Y-%m-%dT%H:%M:%S")
        self.cac_login_required = cac_login_required
        self.source_page_url = source_page_url
        self.downloadable_items = downloadable_items
        self.download_url = download_url
        self.file_ext = file_ext
        self.display_title = doc_type + " " + doc_num + ": " + doc_title

    def get_version_hash_fields(self) -> dict:
        """Returns a dict of the fields used for hashing"""
        return {
            "doc_name": self.doc_name,
            "doc_num": self.doc_num,
            "publication_date": self.publication_date,
            "download_url": self.download_url,
            "display_title": self.doc_title,
        }

    def set_display_name(self, name: str) -> None:
        """Update the display name for this DocItemFields instance"""
        self.display_title = name

    def populate_doc_item(
        self, display_org: str, data_source: str, source_title: str, crawler_used: str
    ) -> DocItem:
        """Takes the data stored in the current object, then populates and returns a scrapy DocItem

        Args:
            display_org (str): Level 1 - GC app 'Source' filter for docs from this crawler
            data_source (str): Level 2 - GC app 'Source' metadata field for docs from this crawler
            source_title (str): Level 3 - filter
            crawler_used (str): name of crawler used

        Returns:
            DocItem: scrapy.Item subclass for storing Documents in GC
        """

        display_source = data_source + " - " + source_title
        is_revoked = False
        source_fqdn = urlparse(self.source_page_url).netloc
        version_hash_fields = self.get_version_hash_fields()
        version_hash = dict_to_sha256_hex_digest(version_hash_fields)

        return DocItem(
            doc_name=self.doc_name,
            doc_title=self.doc_title,
            doc_num=self.doc_num,
            doc_type=self.doc_type,
            display_doc_type=self.display_doc_type,
            publication_date=self.publication_date,
            cac_login_required=self.cac_login_required,
            crawler_used=crawler_used,
            downloadable_items=self.downloadable_items,
            source_page_url=self.source_page_url,
            source_fqdn=source_fqdn,
            download_url=self.download_url,
            version_hash_raw_data=version_hash_fields,
            version_hash=version_hash,
            display_org=display_org,
            data_source=data_source,
            source_title=source_title,
            display_source=display_source,
            display_title=self.display_title,
            file_ext=self.file_ext,
            is_revoked=is_revoked,
        )
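For reference, a minimal sketch of how `DocItemFields` is meant to be used; the document name, title, and URLs below are hypothetical placeholders (the real values come from the spider added in this PR):

```python
from datetime import datetime

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields

# Hypothetical values for illustration only; a spider would scrape these.
fields = DocItemFields(
    doc_name="DISAI 100-50",
    doc_title="Example Instruction Title",
    doc_num="100-50",
    doc_type="Instruction",
    publication_date=datetime(2024, 4, 26),
    cac_login_required=False,
    source_page_url="https://disa.mil/About/DISA-Issuances/Instructions",
    downloadable_items=[
        {"doc_type": "pdf", "download_url": "https://disa.mil/example.pdf", "compression_type": None}
    ],
    download_url="https://disa.mil/example.pdf",
    file_ext="pdf",
)

# Optionally override the default "<type> <num>: <title>" display title.
fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")

# populate_doc_item hashes the version fields and returns a scrapy DocItem.
doc_item = fields.populate_doc_item(
    display_org="Defense Information Systems Agency",
    data_source="Defense Information Systems Agency",
    source_title="DISA Policy/Issuances",
    crawler_used="DISA_pubs",
)
```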
93 changes: 93 additions & 0 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/disa_pubs_spider.py
@@ -0,0 +1,93 @@
from typing import Any, Generator
from urllib.parse import urljoin
from datetime import datetime
import bs4
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider


class DisaPubsSpider(GCSpider):
    """
    As of 04/26/2024
    crawls https://disa.mil/About/DISA-Issuances/Instructions for 42 pdfs (doc_type = Instruction)
    and https://disa.mil/About/DISA-Issuances/Circulars for 6 pdfs (doc_type = Circulars)
    """

    # Crawler name
    name = "DISA_pubs"
    # Level 1: GC app 'Source' filter for docs from this crawler
    display_org = "Defense Information Systems Agency"
    # Level 2: GC app 'Source' metadata field for docs from this crawler
    data_source = "Defense Information Systems Agency"
    # Level 3 filter
    source_title = "DISA Policy/Issuances"

    domain = "disa.mil"
    base_url = f"https://{domain}"
    allowed_domains = [domain]
    start_urls = [
        urljoin(base_url, "/About/DISA-Issuances/Instructions"),
        urljoin(base_url, "/About/DISA-Issuances/Circulars"),
    ]

    rotate_user_agent = True
    date_format = "%m/%d/%y"

    def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
        """Parses doc items out of DISA Policy/Issuances site"""
        page_url = response.url
        soup = bs4.BeautifulSoup(response.body, features="html.parser")

        for row in soup.find(id="main-content").find_all("tr"):
            row_items = row.find_all("td")

            # Ensure elements are present and skip the header row
            if len(row_items) != 3:
                continue

            link_cell, title_cell, publication_cell = row_items

            url = urljoin(self.base_url, link_cell.find("a").get("href"))
            doc_name = self.ascii_clean(link_cell.find("a").get_text().strip())

            pdf_di = [
                {"doc_type": "pdf", "download_url": url, "compression_type": None}
            ]

            fields = DocItemFields(
                doc_name=doc_name,
                doc_title=self.ascii_clean(title_cell.get_text().strip()),
                doc_num=doc_name.split(" ")[-1],
                doc_type=self.get_doc_type(doc_name),
                publication_date=self.extract_date(publication_cell.get_text()),
                cac_login_required=False,
                source_page_url=page_url,
                downloadable_items=pdf_di,
                download_url=url,
                file_ext="pdf",
            )
            fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")

            yield fields.populate_doc_item(
                display_org=self.display_org,
                data_source=self.data_source,
                source_title=self.source_title,
                crawler_used=self.name,
            )

    def extract_date(self, input_date: str) -> datetime:
        """Takes in dates formatted as 03/17/17 or 04/15/ 13 and returns datetime object"""
        published = input_date.strip().replace(" ", "")
        published_timestamp = datetime.strptime(published, self.date_format)
        return published_timestamp

    def get_doc_type(self, doc_name: str) -> str:
        """Takes in the doc name and returns the type, only handles DISAC and DISAI docs"""
        if "DISAC" in doc_name:
            return "Circular"
        if "DISAI" in doc_name:
            return "Instruction"
        raise ValueError(f"Unexpected value for doc_name {doc_name}")
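A quick sketch of the two helper methods in isolation, assuming a `GCSpider` subclass can be instantiated with no arguments; the document names below are hypothetical:

```python
from dataPipelines.gc_scrapy.gc_scrapy.spiders.disa_pubs_spider import DisaPubsSpider

spider = DisaPubsSpider()

# extract_date strips whitespace, so "04/15/ 13" (as seen on the site) parses cleanly.
print(spider.extract_date("04/15/ 13"))  # 2013-04-15 00:00:00
print(spider.extract_date("03/17/17"))   # 2017-03-17 00:00:00

# get_doc_type keys off the DISAC/DISAI prefix in the document name.
print(spider.get_doc_type("DISAC 300-110-3"))  # Circular
print(spider.get_doc_type("DISAI 100-50"))     # Instruction
```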
3 changes: 2 additions & 1 deletion paasJobs/crawler_schedule/tuesday.txt
@@ -2,4 +2,5 @@ army_pubs_spider.py
cfr_spider.py
dha_spider
cnss_spider
-us_code_spider
+us_code_spider
+disa_pubs_spider.py
50 changes: 45 additions & 5 deletions runCrawler.sh
@@ -1,10 +1,50 @@
#!/bin/bash

print_help () {
    echo "Usage: ./runCrawler.sh -c={crawler name} --reset {optional: resets the data directory before running}"
    echo "Example: ./runCrawler.sh -c=disa_pubs_spider --reset"
    exit 1
}

RESET=false
CRAWLER=""
for i in "$@"; do
    case $i in
        -c=*|--crawler=*)
            CRAWLER="${i#*=}"
            shift # past argument=value
            ;;
        --reset)
            RESET=true
            shift # past argument with no value
            ;;
        -*|--*)
            print_help
            ;;
        *)
            ;;
    esac
done

if [ "$CRAWLER" == "" ]; then
    echo "ERROR: Please use the -c option to specify a crawler"
    print_help
fi

export PYTHONPATH="$(pwd)"
-CRAWLER_DATA_ROOT=./tmp
+CRAWLER_DATA_ROOT=./tmp/$CRAWLER
mkdir -p "$CRAWLER_DATA_ROOT"

echo "CRAWLER = ${CRAWLER}"
echo "RESET = ${RESET}"
echo "DATA_ROOT = ${CRAWLER_DATA_ROOT}"

if $RESET; then
    rm $CRAWLER_DATA_ROOT/*
fi

touch "$CRAWLER_DATA_ROOT/prev-manifest.json"
-scrapy runspider dataPipelines/gc_scrapy/gc_scrapy/spiders/hasc_spider.py \
-    -a download_output_dir="$CRAWLER_DATA_ROOT" \
-    -a previous_manifest_location="$CRAWLER_DATA_ROOT/prev-manifest.json" \
-    -o "$CRAWLER_DATA_ROOT/output.json"
+scrapy runspider dataPipelines/gc_scrapy/gc_scrapy/spiders/$CRAWLER.py \
+    -a download_output_dir="$CRAWLER_DATA_ROOT" \
+    -a previous_manifest_location="$CRAWLER_DATA_ROOT/prev-manifest.json" \
+    -o "$CRAWLER_DATA_ROOT/output.json"
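For local debugging outside the shell script, roughly the same run can be reproduced from Python with Scrapy's `CrawlerProcess`. This is a sketch, not part of the commit: the paths mirror the script's defaults, and it assumes the `-a` flags map to spider keyword arguments (Scrapy's standard behavior):

```python
import os

from scrapy.crawler import CrawlerProcess

from dataPipelines.gc_scrapy.gc_scrapy.spiders.disa_pubs_spider import DisaPubsSpider

data_root = "./tmp/disa_pubs_spider"  # mirrors CRAWLER_DATA_ROOT in runCrawler.sh
os.makedirs(data_root, exist_ok=True)

# runCrawler.sh touches the previous-manifest file before running, so do the same here.
open(f"{data_root}/prev-manifest.json", "a").close()

process = CrawlerProcess(settings={
    # Equivalent of the -o flag: write scraped items to output.json
    "FEEDS": {f"{data_root}/output.json": {"format": "json"}},
})

# Equivalent of the -a flags: keyword arguments become spider attributes.
process.crawl(
    DisaPubsSpider,
    download_output_dir=data_root,
    previous_manifest_location=f"{data_root}/prev-manifest.json",
)
process.start()
```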
