# main.py — LinkedIn company scraper entry point.
# (Removed GitHub web-page scrape residue: notification banner, file stats,
# and the line-number gutter, none of which is valid Python source.)
import json
import signal
import time
from config import COMPANY_INDUSTRY_COLUMN, COMPANY_NAME_COLUMN, COMPANY_SIZE_COLUMN, DEFAULT_FUNCTION_TIMEOUT, LINKEDIN_PROFILE_COLUMN, SHEET_ID, SHEET_NAME, SHEETS_FILE_ID, STAT_FILE_NAME
from custom_exceptions import timeout_handler
from scrap import google_search, get_company_size_and_industry, linked_search, quit_driver, start_driver
from sheets import google_auth, get_sheets_data, update_sheet
# Initialize Google Sheets
# NOTE(review): module-import side effect — `google_auth()` runs as soon as this
# module is imported, so importing main.py requires valid Google credentials.
# Presumably intentional for a standalone script; confirm if this module is
# ever imported elsewhere.
sheet_service = google_auth()
def get_company_details(driver, company_name, name_profile_map):
    """Resolve a company's LinkedIn profile URL, size, and industry.

    The profile URL is taken from ``name_profile_map`` when present; otherwise
    a LinkedIn search is tried, then a Google search as a last resort.  The URL
    is normalized (query string and trailing slash stripped) before the
    company's "/about/" page is scraped for size and industry.

    Returns a dict keyed by LINKEDIN_PROFILE_COLUMN, COMPANY_SIZE_COLUMN and
    COMPANY_INDUSTRY_COLUMN; empty when no profile could be found.
    """
    print(f'\ncompany_name: {company_name}')
    details = {}
    profile = (
        name_profile_map.get(company_name)
        or linked_search(driver, company_name)
        or google_search(driver, company_name)
    )
    if not profile:
        return details
    # Normalize the URL: drop any query parameters, then any trailing slash.
    profile = profile.split('?')[0].rstrip('/')
    print(f'search result for: {company_name} is company_profile: {profile}')
    details[LINKEDIN_PROFILE_COLUMN] = profile
    print(f'getting company size and industry for: {company_name}, {profile}')
    size, industry = get_company_size_and_industry(driver, f'{profile}/about/')
    print(f'company size and industry results for: {company_name}, {profile} is company_size: {size}, company_industry: {industry}')
    details[COMPANY_SIZE_COLUMN] = size
    details[COMPANY_INDUSTRY_COLUMN] = industry
    return details
def _flush_progress(local_company_map, sheet_data, row_start, row_end, max_count_per_cycle, start_time, end_time):
    """Write scraped data back to the sheet and persist resume state.

    Called once per completed cycle and once more from the outer ``finally``
    so partial results from an interrupted cycle are not lost.  The sheet is
    only updated when there is data; the stat file (resume position) is
    always rewritten.
    """
    print(f'\n\nlocal_company_map: {local_company_map}, rows_processed: {row_end} in {end_time - start_time} seconds\n\n')
    if local_company_map:
        print(f'\n\nUpdating sheet: {SHEETS_FILE_ID}')
        update_sheet(sheet_service, SHEETS_FILE_ID, SHEET_ID, sheet_data, local_company_map)
        print(f'Updated the sheet: {SHEETS_FILE_ID}')
    stat = {'row_start': row_start, 'max_count_per_cycle': max_count_per_cycle}
    with open(STAT_FILE_NAME, 'w') as stat_file:
        stat_file.write(json.dumps(stat, indent=4))


def start():
    """Entry point: scrape LinkedIn details for companies listed in a sheet.

    Resumes from the row index saved in the stat file, processes the sheet in
    slices of ``max_count_per_cycle`` rows, and flushes progress (sheet update
    plus stat file) after every cycle and again on exit, so an interrupted run
    can resume where it stopped.  Each per-company lookup is bounded by a
    SIGALRM timeout of DEFAULT_FUNCTION_TIMEOUT seconds.
    """
    # Pre-bind everything the finally-block touches so an early failure
    # (e.g. start_driver() raising) cannot trigger a NameError during cleanup
    # — the original crashed in `finally` on `driver`/`local_company_map`/
    # `row_end`/`end` when the try body failed before binding them.
    driver = None
    sheet_data = []
    local_company_map = {}
    row_start = 1
    row_end = 1
    max_count_per_cycle = 5
    # Renamed from `start` so the local no longer shadows this function.
    start_time = time.time()
    try:
        driver = start_driver()
        try:
            with open(STAT_FILE_NAME, 'r') as stat_json_file:
                stat = json.load(stat_json_file)
        except FileNotFoundError:
            # First run: no stat file yet — begin at row 1 in small cycles.
            stat = {"row_start": 1, "max_count_per_cycle": 5}
        print(f'\n\nGetting sheets data for SHEETS_FILE_ID: {SHEETS_FILE_ID}, SHEET_NAME: {SHEET_NAME}, column: {COMPANY_NAME_COLUMN} to {COMPANY_INDUSTRY_COLUMN}')
        sheet_data = get_sheets_data(sheet_service, SHEETS_FILE_ID, SHEET_NAME, f'{COMPANY_NAME_COLUMN}:{COMPANY_INDUSTRY_COLUMN}')
        print('fetched sheet data')
        sheet_len = len(sheet_data)
        row_start = stat.get('row_start', 1)
        max_count_per_cycle = stat.get('max_count_per_cycle', 10)
        # Register the timeout handler once; the alarm itself is armed per
        # company below (the original re-registered it on every iteration).
        signal.signal(signal.SIGALRM, timeout_handler)
        while sheet_len > row_start:
            company_names = set()
            local_company_map = {}
            name_profile_map = {}
            row_end = min(sheet_len, row_start + max_count_per_cycle)
            sheet_data_slice = sheet_data[row_start:row_end]
            print(f'\n\nsheet_data_slice: row_start: {row_start}, row_end: {row_end}, data: {sheet_data_slice}')
            # Rows with more than 2 cells presumably already carry scraped
            # results, so only 1- or 2-cell rows are queued for processing.
            for row in sheet_data_slice:
                row_len = len(row)
                if row and row_len <= 2:  # skip empty cells / processed rows
                    company_names.add(row[0])
                    # 2nd cell, when present, is a pre-known profile URL.
                    name_profile_map[row[0]] = row[1] if row_len == 2 else None
            print(f'\n\nunique company_names: {company_names}\n\n')
            for company_name in company_names:
                signal.alarm(DEFAULT_FUNCTION_TIMEOUT)
                try:
                    local_company_map[company_name] = get_company_details(driver, company_name, name_profile_map)
                finally:
                    # Cancel the timer
                    signal.alarm(0)
                    # Fill missing fields with 'NA'.  The `or {}` guards the
                    # timeout path where nothing was stored yet — the original
                    # called .get() on None here and raised AttributeError.
                    company_data = local_company_map.get(company_name) or {}
                    local_company_map[company_name] = {
                        LINKEDIN_PROFILE_COLUMN: company_data.get(LINKEDIN_PROFILE_COLUMN, 'NA'),
                        COMPANY_SIZE_COLUMN: company_data.get(COMPANY_SIZE_COLUMN, 'NA'),
                        COMPANY_INDUSTRY_COLUMN: company_data.get(COMPANY_INDUSTRY_COLUMN, 'NA'),
                    }
            row_start = row_end
            _flush_progress(local_company_map, sheet_data, row_start, row_end, max_count_per_cycle, start_time, time.time())
            # Already persisted: clear so the finally-flush below does not
            # redundantly write the same cycle to the sheet a second time.
            local_company_map = {}
    except Exception as ex:
        end_time = time.time()
        print(f'\n\n-------> in {end_time - start_time} seconds. Ex: {ex}\n\n')
    finally:
        if driver is not None:
            quit_driver(driver)
        # Flush whatever an interrupted cycle collected; a no-op sheet update
        # after a clean run since the map was cleared above.
        _flush_progress(local_company_map, sheet_data, row_start, row_end, max_count_per_cycle, start_time, time.time())
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    start()