-
Notifications
You must be signed in to change notification settings - Fork 0
/
library.py
397 lines (299 loc) · 17.6 KB
/
library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
import numpy as np
import pandas as pd
import wrds
import json
import os
import shutil
import json
from datetime import date
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score, precision_score, recall_score
def get_CompuStat_WRDS (variables, dataset, db):
"""
Retrieve CompuStat variables and merge them into the ECL dataset.
Args:
variables (str): A string containing the CompuStat variables to retrieve.
dataset (pandas.DataFrame): The ECL dataset.
db: The WRDS Python API Connection object.
Returns:
pandas.DataFrame: The ECL dataset with CompuStat variables added.
"""
# SQL query to extract the specified CompuStat variables.
query = """
SELECT gvkey, datadate FROM comp_na_annual_all.funda
WHERE datafmt = 'STD'
AND indfmt = 'INDL'
AND consol = 'C'
AND popsrc = 'D'
AND datadate BETWEEN '1993-01-01' AND '2023-05-01'
"""
query = query[:27] + ',' + variables + ' ' + query[27:]
# Fetch the data.
try:
wrds_comp = db.raw_sql(query)
except:
print('Connection with CompuStat failed.')
# Convert datadate and gvkey columns to appropriate data types.
dataset['datadate'] = pd.to_datetime(dataset['datadate'], dayfirst=True)
wrds_comp['datadate'] = pd.to_datetime(wrds_comp['datadate'])
dataset['gvkey'] = dataset['gvkey'].astype(float)
wrds_comp['gvkey'] = wrds_comp['gvkey'].astype(float)
# Merge CompuStat and the ECL dataset.
merged_dataset = dataset.merge(wrds_comp, on = ['gvkey', 'datadate'], how = 'left', indicator=True)
left_only_records = len(merged_dataset.loc[merged_dataset['_merge'] == 'left_only'])
print(f'{left_only_records} records in the dataset do not have an accompanying CompuStat record.')
return merged_dataset
def get_CompuStat_local (path, dataset, update):
"""
Retrieve CompuStat data from a local CSV file and merge it with the ECL dataset.
Args:
path (str): The path to the local copy of CompuStat, stored as a .csv file.
dataset (pandas.DataFrame): The ECL dataset.
update (bool): If True, return CompuStat data for an update; otherwise, merge the data into the dataset.
Returns:
pandas.DataFrame: The updated ECL dataset or the CompuStat data for an update.
"""
# Read in the local CompuStat data from the CSV file.
local_comp = pd.read_csv(path, low_memory=False)
# Read in the local CompuStat data from the CSV file.
local_comp['datadate'] = pd.to_datetime(local_comp['datadate'])
local_comp['gvkey'] = local_comp['gvkey'].astype(float)
# Define indices to remove based on screening variables.
indfmt = local_comp.loc[~(local_comp['indfmt'] == 'INDL')].index
consol = local_comp.loc[~(local_comp['consol'] == 'C')].index
popsrc = local_comp.loc[~(local_comp['popsrc'] == 'D')].index
datafmt = local_comp.loc[~(local_comp['datafmt'] == 'STD')].index
datadate = local_comp.loc[(local_comp['datadate'] < '1993-01-01') |
(local_comp['datadate'] > '2023-05-01')].index
# Additionally, remove records that do not correspond with relevant 10k records.
local_comp['src'] = local_comp['src'].fillna('0').astype(int)
src = local_comp.loc[(local_comp['src'].isin([3,4,9,13,14,26,37,43,99,88]))].index
# Additionally, remove records that do not correspond with relevant 10k records.
indices = list(indfmt) + list(consol) + list(popsrc) + list(datafmt) + list(datadate) + list(src)
local_comp = local_comp.drop(indices).reset_index(drop=True)
print('Dropped ' + str(len(indices)) + ' rows from CompuStat based on screening variables')
# return data for update
if update:
return local_comp
# Adjust the datatypes of the variables that we will use to merge CompuStat records and ECL records.
dataset['datadate'] = pd.to_datetime(dataset['datadate'], dayfirst=True)
dataset['gvkey'] = dataset['gvkey'].astype(float)
# Drop duplicate column
local_comp = local_comp.drop('cik', axis=1)
# Merge CompuStat and the ECL dataset.
merged_dataset = dataset.merge(local_comp, on = ['gvkey', 'datadate'], how = 'left', indicator=True)
left_only_records = len(merged_dataset.loc[merged_dataset['_merge'] == 'left_only'])
print(f'{left_only_records} records in the dataset do not have an accompanying CompuStat record.')
return merged_dataset
def compute_features(dataset):
"""
dataset (pandas.DataFrame): A Pandas dataframe containing the ECL dataset including the CompuStat variables.
returns: (1) The ECL dataset with 28 financial ratios added and (2) the column indices of these ratios.
"""
# Calculate 28 financial ratios based on Mai et al. (2019) for DL-based bankruptcy prediction.
dataset['actlct'] = dataset['act'] / dataset['lct']
dataset['apsale'] = dataset['ap'] / dataset['sale']
dataset['cashat'] = dataset['che'] / dataset['at']
dataset['chat'] = dataset['ch'] / dataset['at']
dataset['chlct'] = dataset['ch'] / dataset['lct']
dataset['ebit_dp_at'] = (dataset['ebit']+dataset['dp']) / dataset['at']
dataset['ebitat'] = dataset['ebit'] / dataset['at']
dataset['ebitsale'] = dataset['ebit'] / dataset['sale']
dataset['fat'] = (dataset['dlc'] + (0.5*dataset['dltt'])) / dataset['at']
dataset['invchinvt'] = dataset['invch'] / dataset['invt']
dataset['invtsale'] = dataset['invt'] / dataset['sale']
dataset['lct_ch_at'] = (dataset['lct']-dataset['ch']) / dataset['at']
dataset['lctat'] = dataset['lct'] / dataset['at']
dataset['lctlt'] = dataset['lct'] / dataset['lt']
dataset['lctsale'] = dataset['lct'] / dataset['sale']
dataset['ltat'] = dataset['lt'] / dataset['at']
dataset['log_at'] = np.log(dataset['at'])
dataset['log_sale'] = np.log(dataset['sale'])
dataset['niat'] = dataset['ni'] / dataset['at']
dataset['nisale'] = dataset['ni'] / dataset['sale']
dataset['oiadpat'] = dataset['oiadp'] / dataset['at']
dataset['oiadpsale'] = dataset['oiadp'] / dataset['sale']
dataset['qalct'] = (dataset['act'] - dataset['invt']) / dataset['lct']
dataset['reat'] = dataset['re'] / dataset['at']
dataset['relct'] = dataset['re'] / dataset['lct']
dataset['saleat'] = dataset['sale'] / dataset['at']
dataset['seqat'] = dataset['seq'] / dataset['at']
dataset['wcapat'] = dataset['wcap'] / dataset['at']
# Calculate 28 financial ratios based on Mai et al. (2019) for DL-based bankruptcy prediction.
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
# Store the columns.
predictors = dataset.iloc[:,-28:].columns
return dataset, predictors
def update_config_file(path_config, start_year, end_year, user_agent, demo):
'''
Update the configuration file with new settings.
Args:
path_config (str): The path to the config.json file.
start_year (int): The start year of the filings to crawl.
end_year (int): The end year of the filings to crawl.
user_agent (str): A string containing the user's email, which will be declared to the SEC when collecting the data.
demo (bool): If True, only a limited number of examples will be crawled for demonstration purposes.
Returns:
dict: A copy of the original config file as a dictionary. The updated version overwrites the original config.json file.
'''
# Open the file.
with open(path_config) as fp:
config = json.load(fp)
# Store a copy
copied = config.copy()
# Update settings in the configuration file.
config['edgar_crawler']['filing_types'] = ['10-K', '10-K405', '10-KT', '10KSB', '10KSB40']
config['edgar_crawler']['start_year'] = start_year
config['edgar_crawler']['end_year'] = end_year
config['edgar_crawler']['quarters'] = [1,2,3,4]
config['edgar_crawler']['user_agent'] = user_agent
# Add a list of CIKs to extract to the config file in the demo.
# In the update process, this can be left blank to crawl all data on the website.
if demo:
ciks = ["1318605", "1018724", "77476", "22701"]
config['edgar_crawler']['cik_tickers'] = ciks
# save the adjustments
with open(path_config, 'w') as fp:
json.dump(config,fp)
return copied
def move_extracted_filings(path_records, path_corpus, demo):
'''
Move newly extracted filings to the EDGAR-corpus and return their metadata.
Args:
path_records (str): The path to the newly extracted records.
path_corpus (str): The path to the EDGAR-corpus.
demo (bool): If True, work with existing files for demonstration purposes.
Returns:
pandas.DataFrame: A DataFrame containing metadata for the newly extracted filings.
The code moves the newly extracted filings to the EDGAR-corpus and removes them from the /update folder.
'''
# Create a list to store the meta-data of the newly extracted records in.
meta_data = []
# loop over the records
for file in os.listdir(path_records):
# Extract the year
substring = file.split('10K')[1]
year = substring.split('_')[1]
# Check if the file is new.
# In the demo, we work with existing files.
target = path_corpus + year + '/'
if (not os.path.exists(target + file)) or (demo):
# if file does not exist, move to dataset (except in demo).
if not demo:
shutil.move(path_records + file, target)
# Read in the new file.
with open(target + file, "r") as f:
data = json.load(f)
# Store metadata.
filing = []
cols = ['cik', 'company', 'filing_type', 'filing_date', 'period_of_report', 'state_of_inc', 'state_location', 'sic']
for variable in cols:
filing.append(data[variable])
filing.append('/' + year + '/' + file)
meta_data.append(filing)
# Store the metadata as a DataFrame.
cols.append('filename')
meta_data = pd.DataFrame(meta_data, columns=cols)
# clear the remaining files from the update folder
dataset_path = os.getcwd() + '/update/edgar-crawler/datasets/'
for file in os.listdir(dataset_path):
if os.path.isfile(dataset_path + file):
os.remove(dataset_path + file)
else:
shutil.rmtree(dataset_path + file)
return meta_data
def fuzzy_match_period_of_report(dataset, edgar, compustat, offset = 7):
'''
Match EDGAR and CompuStat records with the same CIK based on the period_of_report within a specified offset.
Args:
dataset (pandas.DataFrame): Result of an outer merge between EDGAR header records and CompuStat records.
edgar (pandas.DataFrame): DataFrame containing EDGAR headers.
compustat (pandas.DataFrame): DataFrame containing CompuStat headers.
offset (int): The maximum number of days that the period_of_report between two records (with the same CIK) may differ. If the difference is smaller, they are matched.
Returns:
pandas.DataFrame: A DataFrame where EDGAR and CompuStat records with the same CIK are matched if their period_of_report lies close to each other (within the offset).
The function extracts records that can be matched and keeps relevant columns, constructs a window for period_of_report matching, and returns a DataFrame with matched records.
'''
# Extract records that can be matched.
compusub = dataset.loc[(~dataset['cik'].isna()) & (dataset['_merge'] == 'left_only')]
edgarsub = dataset.loc[(~dataset['cik'].isna()) & (dataset['_merge'] == 'right_only')]
# Keep relevant columns.
edgarsub = edgarsub[['cik', 'company','period_of_report']]
compusub = compusub[['cik', 'conm','period_of_report']]
# Construct a window for period_of_report matching.
edgarsub['lower_date'] = edgarsub['period_of_report'] + pd.DateOffset(days = offset*-1)
edgarsub['upper_date'] = edgarsub['period_of_report'] + pd.DateOffset(days = offset)
# Match the reports on CIK, keeping only records with dates within the window.
df = compusub.merge(right=edgarsub, on='cik', how="left", indicator=True)
df = df.loc[(df['period_of_report_x'] >= df['lower_date']) & (df['period_of_report_x'] <= df['upper_date'])]
# Adjust the columns for the matched records.
df = df.drop(['lower_date', 'upper_date', '_merge'], axis = 1)
# Get the matched records from CompuStat and EDGAR databases.
compustat_records = pd.merge(compustat.reset_index(), df, left_on = ['cik', 'period_of_report'], right_on = ['cik', 'period_of_report_x'] , how = 'inner').set_index('index')
edgar_records = pd.merge(edgar.reset_index(), df, left_on = ['cik', 'period_of_report'], right_on = ['cik', 'period_of_report_y'] , how = 'inner').set_index('index')
# Adjust the columns for the EDGAR records.
compustat_records['period_of_report'] = compustat_records['period_of_report_y']
compustat_records['period_of_report_compu'] = compustat_records['period_of_report_x']
compustat_records['conm'] = compustat_records['conm_x']
compustat_records = compustat_records.drop(['period_of_report_x', 'period_of_report_y', 'conm_x', 'conm_y', 'company'], axis = 1)
# Adjust the columns for the EDGAR records.
edgar_records['period_of_report'] = edgar_records['period_of_report_y']
edgar_records['company'] = edgar_records['company_x']
edgar_records = edgar_records.drop(['period_of_report_x', 'period_of_report_y', 'company_x', 'company_y', 'conm'], axis = 1)
# Match on CIK/period_of_report.
matched = pd.merge(compustat_records, edgar_records, on=['cik', 'period_of_report'], how='outer', indicator=True)
matched = matched.drop_duplicates()
# Get the indices of the unmatched records in the dataset.
compustat_in_dataset = pd.merge(dataset.reset_index(), matched, left_on = ['cik', 'period_of_report'], right_on = ['cik', 'period_of_report_compu'] , how = 'inner').set_index('index')
edgar_in_dataset = pd.merge(dataset.reset_index(), matched, left_on = ['cik', 'period_of_report'], right_on = ['cik', 'period_of_report'] , how = 'inner').set_index('index')
comp_indices = compustat_in_dataset.index
edgar_indices = edgar_in_dataset.index
# Remove extra column 'matched' from the matched records.
matched = matched.drop('period_of_report_compu', axis = 1)
# Drop the matched records from the dataset.
dataset = dataset.drop(comp_indices)
dataset = dataset.drop(edgar_indices)
# Add the matched records to the dataset.
result = pd.concat([dataset, matched])
return result
def fuzzy_match_name(dataset, edgar, compustat):
'''
Match EDGAR and CompuStat records based on the period_of_report and company name.
Args:
dataset (pandas.DataFrame): Result of an outer merge between EDGAR header records and CompuStat records.
edgar (pandas.DataFrame): DataFrame containing EDGAR headers.
compustat (pandas.DataFrame): DataFrame containing CompuStat headers.
Returns:
pandas.DataFrame: A DataFrame where EDGAR and CompuStat records are matched if their period_of_report is the same and they have the same name.
The function extracts records that can be matched, keeps relevant columns, and returns a DataFrame with matched records.
'''
# Extract records that can be matched.
compusub = dataset.loc[(dataset['_merge'] == 'left_only') & ((dataset['cik'] == 999999))]
edgarsub = dataset.loc[(dataset['_merge'] == 'right_only')]
# Keep relevant columns.
edgarsub = edgarsub[['company','period_of_report']]
compusub = compusub[['conm','period_of_report']]
# Merge the CompuStat and EDGAR records on company name and period_of_report.
matched = pd.merge(left=compusub, right=edgarsub, how='inner', left_on=['conm', 'period_of_report'], right_on=['company', 'period_of_report'], indicator=True)
# Get the matched records from CompuStat and EDGAR databases.
compustat_records = pd.merge(compustat.reset_index(), matched, left_on = ['conm', 'period_of_report'], right_on = ['conm', 'period_of_report'] , how = 'inner', indicator=False).set_index('index')
edgar_records = pd.merge(edgar.reset_index(), matched, left_on = ['company', 'period_of_report'], right_on = ['company', 'period_of_report'] , how = 'inner', indicator=False).set_index('index')
# Adjust the columns for the EDGAR records.
compustat_records = compustat_records.drop(['company', '_merge'], axis = 1)
edgar_records = edgar_records.drop(['conm', '_merge'], axis = 1)
# Match on company name/period_of_report and adjust CIK.
matched = pd.merge(compustat_records, edgar_records, left_on=['conm', 'period_of_report'], right_on=['company', 'period_of_report'], how='outer', indicator=True)
matched['cik'] = matched['cik_y']
matched = matched.drop(['cik_x', 'cik_y'], axis=1)
# Get the indices of the unmatched records.
compustat_in_dataset = pd.merge(dataset.reset_index(), matched, left_on = ['conm', 'period_of_report'], right_on = ['conm', 'period_of_report'] , how = 'inner').set_index('index')
edgar_in_dataset = pd.merge(dataset.reset_index(), matched, left_on = ['company', 'period_of_report'], right_on = ['company', 'period_of_report'] , how = 'inner').set_index('index')
comp_indices = compustat_in_dataset.index
edgar_indices = edgar_in_dataset.index
# Drop the matched records from the dataset.
dataset = dataset.drop(comp_indices)
dataset = dataset.drop(edgar_indices)
# Add the matched records.
result = pd.concat([dataset, matched])
return result