Commit

Patch army g1 spider (#228)
* new div selection for updated site

* refactor and add doc strings

---------

Co-authored-by: Matthew Kersting <[email protected]>
matthew-kersting and Matthew Kersting committed May 8, 2024
1 parent 2592e7e commit ff85a6a
Showing 2 changed files with 140 additions and 145 deletions.
14 changes: 8 additions & 6 deletions dataPipelines/gc_scrapy/gc_scrapy/doc_item_fields.py
@@ -38,19 +38,22 @@ def __init__(
self.file_ext = file_ext
self.display_title = doc_type + " " + doc_num + ": " + doc_title

def get_version_hash_fields(self) -> dict:
"""Returns a dict of the fields used for hashing"""
return {
self.hash_fields = {
"doc_name": self.doc_name,
"doc_num": self.doc_num,
"publication_date": self.publication_date,
"download_url": self.download_url,
"display_title": self.display_title,
}

def set_version_hash_field(self, key: str, value: str) -> None:
"""Sets a new field or updates an old one in the dict used for hashing"""
self.hash_fields[key] = value

def set_display_name(self, name: str) -> None:
"""Update display name for DocItemFields instance"""
self.display_title = name
self.hash_fields["display_title"] = name

def populate_doc_item(
self, display_org: str, data_source: str, source_title: str, crawler_used: str
@@ -70,8 +73,7 @@ def populate_doc_item(
display_source = data_source + " - " + source_title
is_revoked = False
source_fqdn = urlparse(self.source_page_url).netloc
version_hash_fields = self.get_version_hash_fields()
version_hash = dict_to_sha256_hex_digest(version_hash_fields)
version_hash = dict_to_sha256_hex_digest(self.hash_fields)

return DocItem(
doc_name=self.doc_name,
@@ -86,7 +88,7 @@
source_page_url=self.source_page_url,
source_fqdn=source_fqdn,
download_url=self.download_url,
version_hash_raw_data=version_hash_fields,
version_hash_raw_data=self.hash_fields,
version_hash=version_hash,
display_org=display_org,
data_source=data_source,
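
For reference, a minimal sketch of how a crawler can exercise the refactored hashing fields, assuming the DocItemFields constructor signature used in the spider diff below (all field values are illustrative placeholders, not real documents):

from datetime import datetime
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest

fields = DocItemFields(
    doc_name="example_pam",
    doc_title="Example Title",
    doc_num="611-21",
    doc_type="DA PAM",
    display_doc_type="DA PAM",
    publication_date=datetime(2024, 5, 1),
    cac_login_required=False,
    source_page_url="https://www.army.mil/g-1#org-g-1-publications",
    downloadable_items=[
        {"doc_type": "pdf", "download_url": "https://example.mil/example_pam.pdf", "compression_type": None}
    ],
    download_url="https://example.mil/example_pam.pdf",
    file_ext="pdf",
)

# hash_fields is now built once in __init__; individual keys can still be overridden before hashing
fields.set_version_hash_field("display_title", fields.doc_title)
version_hash = dict_to_sha256_hex_digest(fields.hash_fields)
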
271 changes: 132 additions & 139 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/army_g1_spider.py
@@ -1,175 +1,168 @@
import scrapy
from typing import Any, Generator, Union
import re
import time
from urllib.parse import urljoin, urlparse
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem
from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.utils import dict_to_sha256_hex_digest, get_pub_date
from datetime import datetime
from bs4 import BeautifulSoup
import json
import html
from datetime import datetime
import scrapy

from dataPipelines.gc_scrapy.gc_scrapy.GCSpider import GCSpider
from dataPipelines.gc_scrapy.gc_scrapy.doc_item_fields import DocItemFields
from dataPipelines.gc_scrapy.gc_scrapy.items import DocItem


class ArmyG1Spider(GCSpider):
name = 'army_g1_pubs'
start_urls = ['https://www.army.mil/g-1#org-g-1-publications']
"""
As of 05/01/2024
crawls https://www.army.mil/g-1#org-g-1-publications for 122 pdfs (doc_type = DA PAM)
"""

# Crawler name
name = "army_g1_pubs"
# Level 1: GC app 'Source' filter for docs from this crawler
display_org = "Dept. of the Army"
# Level 2: GC app 'Source' metadata field for docs from this crawler
data_source = "Army Publishing Directorate"
# Level 3 filter
source_title = "G-1 Publications"

start_urls = ["https://www.army.mil/g-1#org-g-1-publications"]
rotate_user_agent = True
randomly_delay_request = True
custom_settings = {
**GCSpider.custom_settings,
"DOWNLOAD_DELAY": 5,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_START_DELAY": 1,
"AUTOTHROTTLE_MAX_DELAY": 10,
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
}

def encoding(self, text):
@staticmethod
def is_ascii_encoded(text: str) -> bool:
"""Returns true if text is ascii encoded"""
try:
text.encode('ascii')
text.encode("ascii")
return False
except UnicodeEncodeError:
return True

def extract_doc_name_from_url(self, url):
doc_name = url.split('/')[-1].split('.')[0]
@staticmethod
def extract_doc_name_from_url(url: str) -> str:
"""Parses doc name out of url"""
doc_name = url.split("/")[-1].split(".")[0]
return doc_name

def extract_doc_number(self, doc_number):
pattern = r'(\d{2,4}-\d{1,4})'
match = re.search(pattern, doc_number)
@staticmethod
def extract_doc_number(text: str):
"""Uses regex to pull doc number from container label"""
pattern = r"(\d{2,4}-\d{1,4})"
match = re.search(pattern, text)
if match:
return match.group(1)
else:
return 'N/A'
return "N/A"

def title_edge_cases(self, text, label):
# renames documents if incorrect on website
@staticmethod
def title_edge_cases(text: str, label: str) -> str:
"""Renames documents if incorrect on website"""
if "Board Brief; NCO Evaluation Board Supplement" in text:
return (label + " Board Brief")
elif "NCO Evaluation Board Supplement" in text:
return label + " Board Brief"
if "NCO Evaluation Board Supplement" in text:
return label
elif text.endswith('.pdf') or text.endswith('docx'):
if text.endswith(".pdf") or text.endswith("docx"):
return label
else:
pattern = r'(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}'
cleaned_text = re.sub(pattern, '', text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text

def extract_date_from_url(self, url):
pattern = r'(\d{4}/\d{2}/\d{2})'
match = re.search(pattern, url)
pattern = r"(?:DA\s+)?PAM\s+\d{2,4}-\d{2,4}"
cleaned_text = re.sub(pattern, "", text)
stripped_text = cleaned_text.strip()
if "\\xc2\\xa0" in stripped_text:
stripped_text = stripped_text.replace("\\xc2\\xa0", " ")
decoded_text = html.unescape(stripped_text)
return decoded_text

@staticmethod
def extract_date_from_url(url: str) -> Union[datetime, str]:
"""Accepts url then parses and returns a datetime object"""
pattern = r"(\d{4}/\d{2}/\d{2})"
match = re.search(pattern, url)
if match:
date = match.group(1)
datetime_ = datetime.strptime(date, "%Y/%m/%d")
return datetime_.strftime("%m-%d-%Y")
else:
return "Unknown"


def parse(self, response):
for container in response.css('.inner-container'):
return datetime_
return "Unknown"

def parse_anchor_tag(
self, link: str, text: str, label_text: str, container_label: str, url: str
) -> Generator[DocItem, Any, None]:
"""Takes in data from anchor tag element and returns the DocItem"""
# only consider links that lead to documents
if link.endswith(".pdf") or link.endswith(".docx"):
# check if title needs to be encoded before conversion to string
if self.is_ascii_encoded(text):
text = str(text.encode("utf-8"))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
# file extension parsed from the link: 'pdf', 'docx', or None
file_type = self.get_href_file_extension(link)

downloadable_items = [
{
"doc_type": file_type,
"download_url": link,
"compression_type": None,
}
]
fields = DocItemFields(
doc_name=doc_name,
doc_title=doc_title,
doc_num=doc_number,
doc_type="DA PAM",
display_doc_type="DA PAM",
publication_date=publication_date,
cac_login_required=False,
source_page_url=url,
downloadable_items=downloadable_items,
download_url=link,
file_ext=file_type,
)
# backwards compatibility: keep display_title equal to doc_title in the hash fields
fields.set_version_hash_field("display_title", fields.doc_title)

yield fields.populate_doc_item(
display_org=self.display_org,
data_source=self.data_source,
source_title=self.source_title,
crawler_used=self.name,
)

def parse(self, response: scrapy.http.Response) -> Generator[DocItem, Any, None]:
"""Parses doc items out of Army G1 Publications site"""
for container in response.css(".inner-container"):
# title of each section
container_label = container.css('h4::text').extract_first()
container_label = container.css("h4::text").extract_first()

for accordion in container.css(".accordion-container"):

for accordion in container.css('.accordion-container'):
for item in accordion.css(".accordion"):

for item in accordion.css('.accordion li'):
# get title text *within* each accordion tab
label_text = item.css("label[for]::text").get().strip()

# get title text *within* each accordion tab
label_text = item.css('label[for]::text').get().strip()

# convert html to string
soup = BeautifulSoup(item.get(), 'html.parser')
div_tag = soup.find('div', class_='rich-text-element bodytext')

if div_tag:
delta_data = div_tag.get('data-delta')
if delta_data:
# parse delta_data as JSON
data = json.loads(delta_data)
for op in data["ops"]:
if 'attributes' in op and 'link' in op['attributes']:
# URL link
link = op['attributes']['link']

# only consider links that lead to documents
if link.endswith('.pdf') or link.endswith('.docx'):
# extract title
text = op['insert']

# check if title needs to be encoded before conversion to string
if self.encoding(text):
text = str(text.encode('utf-8'))[2:-1]

# clean data for `fields` dictionary
doc_title = self.title_edge_cases(text, label_text)
doc_number = self.extract_doc_number(container_label)
doc_name = self.extract_doc_name_from_url(link)
publication_date = self.extract_date_from_url(link)
#file_type = 'pdf' if link.endswith('.pdf') else ('docx' if link.endswith('.docx') else None)
file_type = self.get_href_file_extension(link)

fields = {
'doc_name': doc_name,
'doc_num': doc_number,
'doc_title': doc_title,
'doc_type': "DA PAM",
'display_doc_type': "DA PAM",
'file_type': file_type,
'download_url': link,
'source_page_url': response.url,
'publication_date': publication_date,
'cac_login_required': False,
'is_revoked': False
}

doc_item = self.populate_doc_item(fields)
yield doc_item

def populate_doc_item(self, fields):
display_org = "Dept. of the Army"
data_source = "Army Publishing Directorate"
source_title = "G-1 Publications"

version_hash_fields = {
"doc_name": fields['doc_name'],
"doc_num": fields['doc_num'],
"publication_date": get_pub_date(fields['publication_date']),
"download_url": fields['download_url'],
"display_title": fields['doc_title']
}

version_hash = dict_to_sha256_hex_digest(version_hash_fields)

return DocItem(
doc_name=fields['doc_name'],
doc_title=fields['doc_title'],
doc_num=fields['doc_num'],
doc_type=fields['doc_type'],
display_doc_type=fields['display_doc_type'],
publication_date=get_pub_date(fields['publication_date']),
cac_login_required=fields['cac_login_required'],
crawler_used=self.name,
downloadable_items=[{
"doc_type": fields['file_type'],
"download_url": fields['download_url'],
"compression_type": None
}],
source_page_url=fields['source_page_url'],
source_fqdn=urlparse(fields['source_page_url']).netloc,
download_url=fields['download_url'],
version_hash_raw_data=version_hash_fields,
version_hash=version_hash,
display_org=display_org,
data_source=data_source,
source_title=source_title,
display_source=data_source + " - " + source_title,
display_title=fields['doc_type'] + " " + fields['doc_num'] + ": " + fields['doc_title'],
file_ext=fields['file_type'], # 'pdf'
is_revoked=fields['is_revoked']
)
soup = BeautifulSoup(item.get(), "html.parser")
div_tag = soup.find("div", class_="rich-text-element bodytext")

if div_tag is None:
continue
# Find all anchor tags
anchor_tags = soup.find_all("a")

# Extract URLs and text
for tag in anchor_tags:
link = tag["href"]
text = tag.get_text()
yield from self.parse_anchor_tag(
link, text, label_text, container_label, response.url
)
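
As a quick sanity check on the new static helpers, a sketch that can be run from the repository environment (the sample URL and label text are illustrative, not real documents):

from dataPipelines.gc_scrapy.gc_scrapy.spiders.army_g1_spider import ArmyG1Spider

link = "https://www.army.mil/e2/downloads/2024/05/01/example_pam.pdf"
print(ArmyG1Spider.extract_doc_name_from_url(link))                       # "example_pam"
print(ArmyG1Spider.extract_doc_number("DA PAM 611-21 Selection Boards"))  # "611-21"
print(ArmyG1Spider.extract_date_from_url(link))                           # datetime(2024, 5, 1)
print(ArmyG1Spider.is_ascii_encoded("Officer\u00a0Evaluations"))          # True (non-breaking space is not ASCII)

A full crawl is started the usual Scrapy way, e.g. scrapy crawl army_g1_pubs (assuming the standard Scrapy project layout under dataPipelines/gc_scrapy).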
