
Commit

pick up legal reference book
Matthew Kersting committed May 15, 2024
1 parent 5feffbd commit d15ee73
Showing 1 changed file with 10 additions and 10 deletions.
20 changes: 10 additions & 10 deletions dataPipelines/gc_scrapy/gc_scrapy/spiders/ic_policies_spider.py
@@ -21,6 +21,7 @@ class IcPoliciesSpider(GCSeleniumSpider):
     crawls https://www.dni.gov/index.php/what-we-do/ic-related-menus/ic-related-links/intelligence-community-directives for 70 pdfs (doc_type = ICD)
     and https://www.dni.gov/index.php/what-we-do/ic-related-menus/ic-related-links/intelligence-community-policy-guidance for 29 pdfs (doc_type = ICPG)
     and https://www.dni.gov/index.php/what-we-do/ic-related-menus/ic-related-links/intelligence-community-policy-memorandums for 5 pdfs (doc_type = ICPG)
+    and https://www.dni.gov/index.php/who-we-are/organizations/ogc/ogc-related-menus/ogc-related-content/ic-legal-reference-book for 1 pdf (doc_type = ICLR)
     """
 
     # Crawler name
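For orientation, the docstring above ties each listing page to a doc_type. Below is a minimal sketch of that mapping, assuming a slug-to-type lookup; PAGE_DOC_TYPES and doc_type_for are illustrative names, not the spider's actual code (the ICPG entry for policy-memorandums follows the docstring as written):

PAGE_DOC_TYPES = {
    "intelligence-community-directives": "ICD",
    "intelligence-community-policy-guidance": "ICPG",
    "intelligence-community-policy-memorandums": "ICPG",  # as stated in the docstring
    "ic-legal-reference-book": "ICLR",
}

def doc_type_for(page_url: str) -> str:
    """Return the doc_type for a listing-page URL by matching its slug."""
    for slug, doc_type in PAGE_DOC_TYPES.items():
        if slug in page_url:
            return doc_type
    raise ValueError(f"unrecognized listing page: {page_url}")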
@@ -87,7 +88,6 @@ def parse(self, response):
         driver.get(page_url)
         time.sleep(5)
 
-        """Parse document objects from page of text"""
         # parse html response
         soup = bs4.BeautifulSoup(driver.page_source, features="html.parser")
         div = soup.find("div", attrs={"itemprop": "articleBody"})
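For context on the parsing step above, here is a standalone sketch runnable against saved HTML. The sample markup and the link extraction are assumptions; only the articleBody lookup mirrors the diff:

import bs4

html = """
<div itemprop="articleBody">
  <a href="/files/ICD/ICD-101.pdf">ICD 101 Intelligence Community Policy System</a>
</div>
"""

soup = bs4.BeautifulSoup(html, features="html.parser")
div = soup.find("div", attrs={"itemprop": "articleBody"})

# Gather every hyperlink in the article body; the real spider goes on to
# build document items from each link's text and href.
links = [(a.get_text(strip=True), a["href"]) for a in div.find_all("a", href=True)]
print(links)  # [('ICD 101 Intelligence Community Policy System', '/files/ICD/ICD-101.pdf')]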
@@ -120,14 +120,15 @@ def parse(self, response):
             names = re.findall(name_pattern, data)
             try:
                 parsed_text = names[0]
-            except Exception as e:
-                print(e)
-                print(link)
-                print(page_url)
-            parsed_name = parsed_text.split(" ")
-            doc_name = " ".join(parsed_name[:2])
-            doc_num = parsed_name[1]
-            doc_title = re.sub(parsed_text, "", data)
+                parsed_name = parsed_text.split(" ")
+                doc_name = " ".join(parsed_name[:2])
+                doc_num = parsed_name[1]
+                doc_title = re.sub(parsed_text, "", data)
+            except IndexError:
+                split_data = data.split(" ")
+                doc_name = " ".join(split_data[:-1])
+                doc_num = split_data[-1]
+                doc_title = doc_name
 
             pdf_url = abs_url(self.base_url, link)
 
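The rewritten block above fixes a latent bug: when names was empty, the old except Exception branch only printed, then fell through to parsed_text.split(" ") with parsed_text unbound (or stale from a previous iteration). Moving the parsing inside the try and catching IndexError handles entries, such as the legal reference book, whose text does not match name_pattern. A self-contained sketch of both paths; the regex here is an assumption, since the real name_pattern is defined outside this hunk:

import re

# Assumed stand-in; the spider's actual name_pattern is not shown in this diff.
name_pattern = re.compile(r"^IC[A-Z]+ [\d.]+ ")

def parse_entry(data: str):
    names = re.findall(name_pattern, data)
    try:
        parsed_text = names[0]                     # e.g. "ICD 101 "
        parsed_name = parsed_text.split(" ")
        doc_name = " ".join(parsed_name[:2])       # "ICD 101"
        doc_num = parsed_name[1]                   # "101"
        doc_title = re.sub(parsed_text, "", data)  # title without the prefix
    except IndexError:
        # Fallback for entries that don't match, e.g. the legal reference book.
        split_data = data.split(" ")
        doc_name = " ".join(split_data[:-1])
        doc_num = split_data[-1]
        doc_title = doc_name
    return doc_name, doc_num, doc_title

print(parse_entry("ICD 101 Intelligence Community Policy System"))
# -> ('ICD 101', '101', 'Intelligence Community Policy System')
print(parse_entry("IC Legal Reference Book 2024"))
# -> ('IC Legal Reference Book', '2024', 'IC Legal Reference Book')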
@@ -162,7 +163,6 @@ def parse(self, response):
                 file_ext="pdf",
                 display_doc_type=self.get_display_doc_type(doc_type),
             )
-            fields.set_display_name(f"{fields.doc_name}: {fields.doc_title}")
 
             yield fields.populate_doc_item(
                 display_org=self.display_org,
