-
Notifications
You must be signed in to change notification settings - Fork 1
/
clean.py
447 lines (420 loc) · 14.2 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
""" Thesis - Data Cleaning and Encoding """
"""
This script takes the raw ACS NSQIP surgery data and produces
a clean csv file that is ready for computation and analysis.
It handles data cleaning, encoding, and preprocessing.
"""
### Imports ###
import pandas as pd
import numpy as np
### Functions ###
# clean_df
def clean_df(df):
"""
Parameters
----------
df : pandas dataframe
dataframe to be cleaned.
Returns
-------
df : pandas dataframe
cleaned dataframe.
-------
This function cleans the passed dataframe by dropping unwanted variables,
encoding null values, making changes to specific variables, and dropping
unecessary RVU variables.
"""
# Opening unwanted variables file and reading them into array
## Note: this needs to be switched to unwanted_vars_full when using the full data file
f = open('data/unwanted_vars_full.txt')
unwanted = []
for line in f:
# Getting rid of newline character
line = line[:-1]
unwanted.append(line)
# Dropping unwanted variables
df = df.drop(unwanted, axis=1)
# -99, NULL, Unknown, and Unknown/Not Reported represent missing values, changing to nan
df = df.replace(-99, np.nan)
df = df.replace('-99', np.nan)
df = df.replace('NULL', np.nan)
df = df.replace('Unknown', np.nan)
df = df.replace('Unknown/Not Reported', np.nan)
# Making changes to some categorical variables to match user guide
# Specifically for age (for creating age categories)
df = df.replace('90+', 90)
# Specifically for transfer status
df = df.replace('Admitted directly from home','Not transferred (admitted from home)')
df = df.replace('Acute Care Hospital','From acute care hospital inpatient')
df = df.replace('Chronic Care Facility','Nursing home - Chronic care - Intermediate care')
df = df.replace('VA Chronic Care Facility','Nursing home - Chronic care - Intermediate care')
df = df.replace('VA Acute Care Hospital','From acute care hospital inpatient')
df['transt'] = df['transt'].replace('Other','Transfer from other')
# Specifically for anesthesia
df = df.replace('MAC/IV Sedation','Monitored Anesthesia care (MAC) / IV Sedation')
df = df.replace('Monitored Anesthesia Care','Monitored Anesthesia care (MAC) / IV Sedation')
# Specifically for diabetes
df = df.replace('ORAL','NON-INSULIN')
# Specifically for othbleed
df = df.replace('Bleeding/Transfusions','Transfusions/Intraop/Postop')
# Changing age to float type for encoding
df['age'] = df['age'].astype(float)
# Dropping any remaining RVU columns as they are unnecessary
for column in df:
if 'rvu' in column:
df = df.drop(column, axis=1)
return df
# encode_vars
def encode_vars(df):
"""
Parameters
----------
df : pandas dataframe
dataframe to be encoded.
Returns
-------
df : pandas dataframe
encoded dataframe.
-------
This function encodes some of the variables in the dataframe by creating
age categories, encoding binary variables, dealing with specific-case binary variables,
encoding postop complication variables, dropping number of occurrances variables,
creating dummy variables, replacing spaces, and renaming some fringe cases.
"""
# Creating caegories for age using a helper function
df['age_cat'] = df['age'].apply(age_categories)
# List of postop variables
postop = ['supinfec','wndinfd','orgspcssi','dehis','oupneumo','reintub',
'pulembol','failwean','renainsf','oprenafl','urninfec','cnscva','cdarrest',
'cdmi','othbleed','othdvt','othsysep','othseshock']
# Encoding non-postop variables into 1/0
for var in df:
# Changing Yes/No to 1/0
if df[var].nunique() == 2 and (df[var].dtype == object or df[var].dtype == str) and var != 'sex' and var != 'inout' and var not in postop:
df[var] = df[var].map({'Yes':1,'No':0})
elif df[var].nunique() == 1 and (df[var].dtype == object or df[var].dtype == str) and var != 'sex' and var != 'inout' and var not in postop:
df[var] = df[var].map({'Yes':1,'No':0})
# Dealing with other binary variables - gender and in/out status
elif var == 'sex':
df[var] = df[var].map({'male':1,'female':0})
elif var == 'inout':
df[var] = df[var].map({'Outpatient':1,'Inpatient':0})
# Encoding postop complications into 1/0
for var in postop:
# Encoding the variables using a helper function
df[var] = df[var].apply(encode_postop)
# Dropping the number of occurrances variables (not relavent)
num_var = 'n' + var
df = df.drop(num_var, axis=1)
# Creating dummy variables (one-hot encoding)
df = pd.get_dummies(df,columns=['race_new','transt','dischdest','anesthes','surgspec','diabetes',
'dyspnea','fnstatus2','prsepis','wndclas','asaclas','age_cat']
,prefix=['race','transt','dischdest','anesthes','surgspec','diabetes',
'dyspnea','fnstatus','sepsis','wnd','asa','age_cat'])
# Replacing spaces/underscores in dummy variables
df.columns = [c.lower().replace(' ', '_') for c in df.columns]
df.columns = [c.replace('_-_','_') for c in df.columns]
df.columns = [c.replace('-','_') for c in df.columns]
# Editing some fringe cases to match user guide
df = df.rename(columns={'transt_not_transferred_(admitted_from_home)':'transt_not_transferred',
'dischdest_against_medical_advice_(ama)':'dischdest_against_medical_advice',
'anesthes_monitored_anesthesia_care_(mac)_/_iv_sedation':'anesthes_mac',
'surgspec_otolaryngology_(ent)':'surgspec_otolaryngology_ent',
'age_cat_85+':'age_cat_85','age_cat_<65':'age_cat_65',
'dischdest_skilled_care,_not_home':'dischdest_skilled_care_not_home'})
return df
# age_categories
def age_categories(age):
"""
Parameters
----------
age : float
age as an float.
Returns
-------
str
categorization of age.
-------
This helper function is used to create age categories that match the user guide.
"""
if age == 'nan':
return np.nan
else:
#age = age.astype(int)
if age < 65:
return '<65'
elif 65 <= age < 75:
return '65_75'
elif 75 <= age < 85:
return '75_85'
elif age >= 85:
return '85+'
# encode_postop
def encode_postop(var):
"""
Parameters
----------
var : postop variable (string)
postop description.
Returns
-------
int
encoded postop description.
-------
This helper function is used to encode the postop complication variables.
"""
if var == 'No Complication':
return 0
elif var == np.nan:
return np.nan
else:
return 1
# aggregate response variables
def aggregate_response(df):
"""
Parameters
----------
df : pandas dataframe
dataframe to be aggregated.
Returns
-------
df : pandas dataframe
aggregated dataframe.
-------
This function is for aggregating the postoperative complication response variables
that are made up of multiple complications or need to be converted from a numeric value.
"""
# Aggregating mortality from numeric "days from operation until death" variable
df['mortality'] = df['dopertod'].apply(lambda x: 1 if x >= 0 else 0)
# Note: all of the following employ helper functions
# Aggregating cardiac event (cardiac arrest or myocardial infarction)
df['cardiac'] = df.apply(aggregate_cardiac, axis=1)
# Aggregating renal failure
df['renal'] = df.apply(aggregate_renal, axis=1)
# Aggregating SSI
df['ssi'] = df.apply(aggregate_ssi, axis=1)
# Aggregating morbidity
df['morbidity'] = df.apply(aggregate_morbidity, axis=1)
# Dropping the variables used for aggregation
df = df.drop(['cdarrest','cdmi','renainsf','oprenafl','supinfec','orgspcssi','wndinfd',
'dehis','pulembol','failwean','cnscva','othsysep','othseshock','dopertod'],
axis=1)
return df
# aggregate_cardiac
def aggregate_cardiac(row):
"""
Parameters
----------
row : pandas dataframe row
row to be aggregarted for cardiac outcome.
Returns
-------
int
encoded cardiac outcome aggregation.
-------
This is a helper function for cardiac variable aggregation.
"""
if row.cdarrest == 1 or row.cdmi == 1:
return 1
else:
return 0
# aggregate_renal
def aggregate_renal(row):
"""
Parameters
----------
row : pandas dataframe row
row to be aggregarted for renal outcome.
Returns
-------
int
encoded renal outcome aggregation.
-------
This is a helper function for renal failure variable aggregation.
"""
if row.renainsf == 1 or row.oprenafl == 1:
return 1
else:
return 0
# aggregate_ssi
def aggregate_ssi(row):
"""
Parameters
----------
row : pandas dataframe row
row to be aggregarted for ssi outcome.
Returns
-------
int
encoded ssi outcome aggregation.
-------
This is a helper function for ssi variable aggregation.
"""
if row.supinfec == 1 or row.orgspcssi == 1 or row.wndinfd == 1:
return 1
else:
return 0
# aggregate_morbidity
def aggregate_morbidity(row):
"""
Parameters
----------
row : pandas dataframe row
row to be aggregarted for morbidity outcome.
Returns
-------
int
encoded morbidity outcome aggregation.
-------
This is a helper function for morbidity variable aggregation.
"""
morb = ['cdarrest','cdmi','renainsf','oprenafl','supinfec','orgspcssi','wndinfd',
'dehis','oupneumo','reintub','pulembol','failwean','urninfec','cnscva','othdvt',
'othsysep','othseshock']
is_morb = 0
for var in morb:
# If one of the complications under morbidity occurred, then flag in morbidity column
if row[var] == 1:
is_morb = 1
return is_morb
# get_bmi
def get_bmi(df):
"""
Parameters
----------
df : pandas dataframe
dataframe before BMI categories.
Returns
-------
df : pandas dataframe
dataframe with BMI categories.
------
This function calculates BMI for an individual based on height and weight.
"""
# Calculating BMI
df['bmi'] = (703*df['weight'])/(df['height']**2)
# Getting BMI class using helper function
df['bmi_class'] = df['bmi'].apply(bmi_classes)
# Getting dummy variables for each BMI class
df = pd.get_dummies(df,columns=['bmi_class'],prefix=['bmi'])
return df
# bmi_classes
def bmi_classes(bmi):
"""
Parameters
----------
bmi : float
BMI.
Returns
-------
str
BMI category.
-------
This helper function creates BMI categories that match the user guide.
"""
if bmi < 18.5:
return 'underweight'
elif bmi >= 18.5 and bmi < 25:
return 'normal'
elif bmi >= 25 and bmi < 30:
return 'overweight'
elif bmi >= 30 and bmi < 35:
return 'obese_1'
elif bmi >= 35 and bmi < 40:
return 'obese_2'
else:
return 'obese_3'
# round_df
def round_df(df):
"""
Parameters
----------
df : pandas dataframe
dataframe to be rounded.
Returns
-------
df : pandas dataframe
rounded dataframe.
-------
This function rounds any values to match the user guide.
"""
# Rounding imputed binary variables
df = df.round({'sex':0,'ethnicity_hispanic':0,'stillinhosp':0,'ventilat':0,'hxcopd':0,
'ascites':0,'hxchf':0,'hypermed':0,'renafail':0,'dialysis':0,'discancr':0,
'wndinf':0,'steroid':0,'wtloss':0,'bleeddis':0,'transfus':0,'returnor':0})
for column in df:
# Rounding imputed PATOS binary variables
if 'patos' in column:
df = df.round({column:0})
# Rounding continuous variables to the correct number of decimal places
elif 'pr' in column:
df = df.round({column:1})
# Elective surgery and sex are special cases and employ a helper function
elif column == 'electsurg':
df.loc[:,'electsurg'] = df.loc[:,'electsurg'].apply(round_elec)
elif column == 'sex':
df.loc[:,'sex'] = df.loc[:,'sex'].apply(round_elec)
return df
# round_elec
def round_elec(elec):
"""
Parameters
----------
elec : float
elective surgery value to be rounded after imputation.
Returns
-------
float
proper elective surgery value after mistakes fixed.
-------
This helper function is used for elec/sex as they are binary variables.
"""
if elec <= 0:
return 0.0
elif elec > 0 and elec <= 0.5:
return 0.0
elif elec > 0.5 and elec <= 1:
return 1.0
elif elec >= 1:
return 1.0
# clean
def clean():
"""
Returns
-------
None.
-------
This function calls all of the others to take a raw NSQIP dataset and
transform it into a clean csv file.
"""
# Reading csv
surg = pd.read_csv('data/nsqip_09_17.csv', encoding='latin1')
# Options for displaying output in pandas
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', -1)
# Cleaning
surg = clean_df(surg)
print(surg['ethnicity_hispanic'].unique())
print(surg['transt'].unique())
print(surg['dischdest'].unique())
print(surg['surgspec'].unique())
# Encoding
surg = encode_vars(surg)
print(surg.nunique())
# Aggregating
surg = aggregate_response(surg)
# Getting BMI
surg = get_bmi(surg)
# Rounding
surg = round_df(surg)
# Output to csv
surg.to_csv('data/surg.csv')
# Seeing if the final list of variables is correct with right number of unique values
f = open('columns.txt','w')
print(surg.nunique(), file=f)
f.close()
### Main ###
if __name__ == '__main__':
clean()