-
Notifications
You must be signed in to change notification settings - Fork 2
/
scraper_db.py
161 lines (129 loc) · 4.81 KB
/
scraper_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python3
'''
Uses Anaconda environment "webby" with Selenium (to load JavaScript on the page
with the content), then Beautiful Soup to parse the HTML:
- Navigate into the project folder
- Create environment:
`conda env create --file environment_webby.yml`
- Switch to environment: `conda activate webby`
- Run script: `python scraper.py`
- Deactivate the environment when done: `conda deactivate`
Notes:
1) Depending on your system, the scraper may take ~1.5-2 days to run - there
are over 8,000 comments and the program bakes in sleep time to allow the
page to load and to space requests so as not to overload the site's server
2) Web scrapers are brittle by nature - changes to the HTML structure of the
target webpage can break the scraper. Always check that the path to the
unique element the scraper targets is updated in the script
'''
import os
import csv
import time
import sqlite3
from sqlite3 import Error
from selenium import webdriver
import bs4
def main():
# Paths and inputs
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'Data/')
db_file = data_dir + 'comments.db'
# CSS Selector on each page that holds the comment text
css_sel = 'div.px-2:nth-child(2)'
# CSV with URL to each comment (downloaded from docket page)
csv_file_path = data_dir + 'scraper_csv.csv'
# Create table statement
sql_create_table = """CREATE TABLE IF NOT EXISTS comments (
ID text PRIMARY KEY,
Comment text
);"""
# Establish database connection
db_conn = create_conn(db_file)
# Create table
if db_conn is not None:
create_table(db_conn, sql_create_table)
else:
print("ERROR: Failed to connect to database!")
# Run the web scraper
scrape_comments(csv_file_path, css_sel, db_conn)
# Close the database connection
db_conn.close()
def create_conn(db_file):
'''
:param db_path: path and file name where database will be saved
:return: connection to database
'''
conn = None
try:
conn = sqlite3.connect(db_file)
return conn
except Error as e:
print(e)
def create_table(db_conn, create_table_sql):
'''
Creates a table from the `create_table_sql` and saves to database linked
via the `db_conn` connection.
:param db_conn: sqlite Connection object to database
:param create_table_sql: str, a SQL CREATE TABLE statement
:return: None
'''
try:
cursor = db_conn.cursor()
cursor.execute(create_table_sql)
db_conn.commit()
except Error as e:
print(e)
def scrape_comments(csv_file_path, css_sel, db_conn):
'''
The web scraper loops over a CSV file located at `csv_file_path` that has
unique web addresses for each comment, grabs text from `css_sel element` on
the page, and saves the comment's ID and text to a database linked via the
`db_conn` connection.
:param csv_file_path: path to a CSV-formatted file with two columns - a
comment ID and the URL to that comment. Assumes CSV has a header row
:param css_sel: CSS selector path to the element containing the targeted
text at the given URL
:param db_conn: sqlite Connection object to database
:return: None
>>>scrape_comments('./my_csv_data', 'div:nth-child(2)', db_conn)
'''
with open(csv_file_path, newline='') as csvfile:
csvreader = csv.reader(csvfile)
next(csvreader) # Skip header
count = 0
for row in csvreader:
if count == 5:
break
count += 1
ID = row[0]
url = row[1]
# Open a browser and fetch the comment text
browser = webdriver.Firefox()
browser.get(url)
time.sleep(4) # Allow time for page to load
inHTML = browser.execute_script("return document.body.innerHTML")
soup = bs4.BeautifulSoup(inHTML, features="lxml")
elem = soup.select(css_sel)
try:
comment = elem[0].text
print(comment)
except IndexError as e:
print('Error processing comment: {}'.format(ID))
print(e)
browser.quit()
continue
# Save ID and comment to database
try:
cursor = db_conn.cursor()
cursor.execute("INSERT INTO comments VALUES (?, ?)",
(ID, comment))
db_conn.commit()
except Error as e:
print('Error saving comment {} to database'
.format(ID))
print(e)
continue
time.sleep(1)
browser.quit()
if __name__ == '__main__':
main()