-
Notifications
You must be signed in to change notification settings - Fork 0
/
Diabetes_Feature_Engineering.py
557 lines (429 loc) · 21.9 KB
/
Diabetes_Feature_Engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
"""
Author : Mustafa Gürkan Çanakçi
LinkedIn : https://www.linkedin.com/in/mgurkanc/
"""
# Project Name : Diabetes Feature Engineering
# Business Problem
# In this project, we will develop a machine learning model that
# can predict whether the Pima Indian women in the dataset have diabetes or not.
# Before modelling, we will perform exploratory data analysis and feature engineering on the dataset.
# Content of Variables:
# Pregnancies - Number of pregnancies
# Glucose - 2-hour plasma glucose concentration in the oral glucose tolerance test
# BloodPressure - Diastolic Blood Pressure
# SkinThickness - Thickness of Skin
# Insulin- 2-hour serum insulin
# DiabetesPedigreeFunction -
# BMI - Body Mass Index
# Age - Age
# Outcome - Diabetic ( 1 or 0 )
########################################################################################
# 1.EXPLORATORY DATA ANALYSIS #
########################################################################################
# * 1.1.Importing necessary libraries*
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# !pip install missingno
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)
######################
# * 1.2.Read the dataset*
######################
df = pd.read_csv("datasets/diabetes.csv")
# * Checking the data*
def check_data(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    The report covers, in order: ``DataFrame.info()``, the shape, the first
    and last ``head`` rows, the per-column null counts, and ``describe()``
    at an extended set of percentiles (transposed).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, optional
        Number of rows shown in the "first" and "last" previews.

    Returns
    -------
    None
    """
    rule = 20 * "-"
    wide_rule = 40 * "-"

    print(rule + "Information".center(20) + rule)
    # info() writes its own report and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    dataframe.info()

    print(rule + "Data Shape".center(20) + rule)
    print(dataframe.shape)

    # Honour the ``head`` argument in both the heading and the preview.
    print("\n" + rule + f"The First {head} Data".center(20) + rule)
    print(dataframe.head(head))

    print("\n" + rule + f"The Last {head} Data".center(20) + rule)
    print(dataframe.tail(head))

    print("\n" + rule + "Missing Values".center(20) + rule)
    print(dataframe.isnull().sum())

    print("\n" + wide_rule + "Describe the Data".center(40) + wide_rule)
    print(dataframe.describe([0.01, 0.05, 0.10, 0.50, 0.75, 0.90, 0.95, 0.99]).T)
check_data(df)
# Conclusion:
# There are only numerical variables in this dataset.
# 768 observations and 9 variables are available (1 of them is the dependent variable).
# At first glance there appear to be no missing values in the dataset,
# but missing values may be hidden inside the data of the variables here.
###############################################
# * 1.3. Checking the missing values in the dataset*
###############################################
# In these measurement columns a value of 0 is treated as "not recorded",
# so it is converted to NaN to make the hidden missing values visible.
# ``np.nan`` is used because the ``np.NaN`` alias no longer exists in NumPy 2.0.
dimension_variable = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[dimension_variable] = df[dimension_variable].replace(0, np.nan)
df.isnull().sum()
# Pregnancies 0
# Glucose 5
# BloodPressure 35
# SkinThickness 227
# Insulin 374
# BMI 11
# DiabetesPedigreeFunction 0
# Age 0
# Outcome 0
####################################################################################
# * 1.4.Define a Function to grab the Numerical and Categorical variables of its dataset*
####################################################################################
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and cardinal groups.

    Numeric-looking columns with few distinct values are counted as
    categorical; object columns with many distinct values are reported
    separately as "categorical but cardinal".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column names are to be classified.
    cat_th : int, optional
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical.
    car_th : int, optional
        An object column with more than ``car_th`` distinct values is
        treated as cardinal.

    Returns
    -------
    cat_cols : list
        Categorical column names (object columns first, then the
        numeric-but-categorical ones), excluding cardinal columns.
    num_cols : list
        Numerical column names.
    cat_but_car : list
        Object columns that are categorical in appearance but cardinal.

    Examples
    --------
    import seaborn as sns
    df = sns.load_dataset("iris")
    print(grab_col_names(df))

    Notes
    -----
    len(cat_cols) + len(num_cols) + len(cat_but_car) equals the number of
    columns. The numeric-but-categorical columns are part of ``cat_cols``.
    """
    object_cols = []
    num_but_cat = []
    cat_but_car = []
    num_cols = []

    # One pass over the columns: decide the bucket from dtype and cardinality.
    for col in dataframe.columns:
        distinct = dataframe[col].nunique()
        if dataframe[col].dtypes == "O":
            object_cols.append(col)
            if distinct > car_th:
                cat_but_car.append(col)
        elif distinct < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    # Cardinal object columns are reported on their own, not as categorical.
    cat_cols = [col for col in object_cols + num_but_cat if col not in cat_but_car]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car
cat_cols, num_cols, cat_but_car = grab_col_names(df)
# Observations: 768
# Variables: 9
# cat_cols: 1
# num_cols: 8
# cat_but_car: 0
# num_but_cat: 1
cat_cols
# Out[14]: ['Outcome']
num_cols
# ['Pregnancies',
# 'Glucose',
# 'BloodPressure',
# 'SkinThickness',
# 'Insulin',
# 'BMI',
# 'DiabetesPedigreeFunction',
# 'Age']
#####################################
# * 1.5.Target Variable Analysis
#####################################
df["Outcome"].value_counts()
# 0 500
# 1 268
# Name: Outcome, dtype: int64
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` for each value of ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both columns.
    target : str
        Column to group by.
    numerical_col : str
        Column whose per-group mean is printed.
    """
    group_means = dataframe.groupby(target).agg({numerical_col: "mean"})
    print(group_means, end="\n\n")
print("###################################")
for col in num_cols:
target_summary_with_num(df,"Outcome",col)
###############################################
# * 1.6.Outliers Analysis
###############################################
# Define a Function about outlier threshold for data columns
def outlier_th(dataframe, col_name, q1=0.25, q3=0.75):
    """Return the ``(low_limit, up_limit)`` outlier fences for a column.

    The fences sit 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile of ``dataframe[col_name]``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Column to compute the fences for.
    q1, q3 : float, optional
        Lower and upper quantile levels.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    values = dataframe[col_name]
    lower_q = values.quantile(q1)
    upper_q = values.quantile(q3)
    whisker = 1.5 * (upper_q - lower_q)
    return lower_q - whisker, upper_q + whisker
# Define a Function about checking outlier for data columns
def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` has any value outside its outlier fences.

    The fences come from ``outlier_th(dataframe, col_name)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Column to inspect.

    Returns
    -------
    bool
        True when at least one value is above the upper fence or below the
        lower fence, otherwise False.
    """
    low_limit, up_limit = outlier_th(dataframe, col_name)
    column = dataframe[col_name]
    # Test the mask itself. Testing the truthiness of the selected rows
    # would miss outlier rows whose cells are all zero or missing.
    is_outlier = (column > up_limit) | (column < low_limit)
    return bool(is_outlier.any())
# Define a Function about replace with threshold for data columns
def replace_with_thresholds(dataframe, variable):
    """Cap the values of ``variable`` at its outlier fences.

    Values below the lower fence are raised to it and values above the upper
    fence are lowered to it; missing values are left as they are. The column
    is replaced on ``dataframe`` itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    variable : str
        Column to cap.
    """
    low_limit, up_limit = outlier_th(dataframe, variable)
    # clip() returns a new column, so a fractional fence can be applied to an
    # integer column without writing a float into integer storage.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)
for col in num_cols:
print(col, check_outlier(df, col))
# Pregnancies True
# Glucose False
# BloodPressure True
# SkinThickness True
# Insulin True
# BMI True
# DiabetesPedigreeFunction True
# Age True
for col in num_cols:
replace_with_thresholds(df, col)
for col in num_cols:
print(col, check_outlier(df, col))
# Pregnancies False
# Glucose False
# BloodPressure False
# SkinThickness False
# Insulin False
# BMI False
# DiabetesPedigreeFunction False
# Age False
################################
# * 1.7.The Missing Values Analysis
################################
# Define a Function about missing values for dataset columns
def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values per affected column.

    Only columns with at least one missing value are listed, sorted from
    most to least missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    na_name : bool, optional
        When True, return the names of the columns that contain missing
        values.

    Returns
    -------
    list or None
        The affected column names when ``na_name`` is truthy, else None.
    """
    na_columns = []
    for column in dataframe.columns:
        if dataframe[column].isnull().sum() > 0:
            na_columns.append(column)

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    summary = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(summary, end="\n")

    return na_columns if na_name else None
missing_values_table(df)
# n_miss ratio
# Insulin 374 48.700
# SkinThickness 227 29.560
# BloodPressure 35 4.560
# BMI 11 1.430
# Glucose 5 0.650
#########################
# * Correlation Analysis
#########################
dimension_variable = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
corr_matrix = df[dimension_variable].corr()
corr_matrix
# Glucose BloodPressure SkinThickness Insulin BMI
# Glucose 1.000 0.225 0.217 0.614 0.235
# BloodPressure 0.225 1.000 0.241 0.115 0.295
# SkinThickness 0.217 0.241 1.000 0.200 0.675
# Insulin 0.614 0.115 0.200 1.000 0.266
# BMI 0.235 0.295 0.675 0.266 1.000
# Insulin - Glucose: strong positive correlation
# BMI - SkinThickness: strong positive correlation
# BMI - BloodPressure: moderate positive correlation
# Visualization of the Correlation Matrix
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm

# Draw the correlation matrix as a colour-coded grid.
fig, ax = plt.subplots()
heatmap = ax.imshow(corr_matrix, interpolation='nearest', cmap=cm.coolwarm)
# Colour bar on the side, ticked at the smallest and largest correlation.
cbar_min = corr_matrix.min().min()
cbar_max = corr_matrix.max().max()
cbar = fig.colorbar(heatmap, ticks=[cbar_min, cbar_max])
# Pin one tick per row/column before labelling, so each label lands on its
# own cell regardless of where the automatic tick locator would put ticks.
labels = list(dimension_variable)
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels, minor=False)
ax.set_xticklabels(labels, minor=False)
plt.show(block=True)
########################################################################################
# 2.FEATURE ENGINEERING #
########################################################################################
###############################################
# * 2.1.Processing for Missing Values and Outliers
###############################################
df.isnull().sum()
# Out[46]:
# Pregnancies 0
# Glucose 5
# BloodPressure 35
# SkinThickness 227
# Insulin 374
# BMI 11
# DiabetesPedigreeFunction 0
# Age 0
# Outcome 0
na_cols = missing_values_table(df, True)
# n_miss ratio
# Insulin 374 48.700
# SkinThickness 227 29.560
# BloodPressure 35 4.560
# BMI 11 1.430
# Glucose 5 0.650
# Define a Function about comparing target variable with missing values
def missing_vs_target(dataframe, target, na_columns):
    """Print how the target behaves when each column is missing vs. present.

    Works on a copy of ``dataframe``: for every column in ``na_columns`` a
    ``<column>_NA_FLAG`` indicator (1 = missing, 0 = present) is added, then
    for every column whose name contains ``_NA_`` the mean and count of
    ``target`` per flag value are printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to analyse; it is not modified.
    target : str
        Name of the target column.
    na_columns : list
        Columns for which a missing-value flag is created.
    """
    flagged = dataframe.copy()
    for column in na_columns:
        flagged[column + '_NA_FLAG'] = np.where(flagged[column].isnull(), 1, 0)

    flag_columns = flagged.columns[flagged.columns.str.contains("_NA_")]
    for flag in flag_columns:
        target_by_flag = flagged.groupby(flag)[target]
        report = pd.DataFrame({"TARGET_MEAN": target_by_flag.mean(),
                               "Count": target_by_flag.count()})
        print(report, end="\n\n\n")
missing_vs_target(df, "Outcome", na_cols)
# Conclusion:
# We examined the missing values of each variable according to the target variable.
# So we decided to apply different methods in order to fill na values according to state of each variable.
# Fill the missing values of these variables with each column's own median.
for median_col in ("Glucose", "BloodPressure", "BMI"):
    df[median_col] = df[median_col].fillna(df[median_col].median())
# Fill the missing values of the "Insulin" and "SkinThickness" variables with the KNN method.
# Both columns are numeric, so no dummy encoding is needed; work on a copy.
dff = df[["Insulin", "SkinThickness"]].copy()
dff.head()
# # Scale the variables to [0, 1] so both contribute comparably to the distances
scaler = MinMaxScaler()
# Keep df's index on every rebuilt frame: the assignments back into df
# below align on index labels, not on row position.
dff = pd.DataFrame(scaler.fit_transform(dff), columns=dff.columns, index=df.index)
dff.head()
# # Implement the KNN method
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
dff = pd.DataFrame(imputer.fit_transform(dff), columns=dff.columns, index=df.index)
dff.head()
# # Undo the scaling so the imputed values are back in the original units
dff = pd.DataFrame(scaler.inverse_transform(dff), columns=dff.columns, index=df.index)
dff.head()
df["Insulin"] = dff["Insulin"]
df["SkinThickness"] = dff["SkinThickness"]
df.isnull().sum()
###############################################
# * 2.2.Creating New Feature Interactions
###############################################
df.head()
# # Create a Glucose Categorical variable
# Each (mask, label) pair tags the rows whose Glucose falls in that band;
# rows matching no band keep a missing GLUCOSE_CAT.
glucose = df['Glucose']
glucose_bands = [
    (glucose < 70, "hipoglisemi"),
    ((glucose >= 70) & (glucose < 100), "normal"),
    ((glucose >= 100) & (glucose < 126), "imparied glucose"),
    (glucose >= 126, "hiperglisemi"),
]
for glucose_mask, glucose_label in glucose_bands:
    df.loc[glucose_mask, 'GLUCOSE_CAT'] = glucose_label
df.head()
df.groupby("GLUCOSE_CAT").agg({"Outcome": ["mean","count"]})
df.groupby("Outcome").agg({"Age":"mean"})
# # Create the Age Categorical variable
# Each (mask, label) pair tags the rows whose Age falls in that band;
# rows matching no band (e.g. Age below 18) keep a missing AGE_CAT.
age = df['Age']
age_bands = [
    ((age >= 18) & (age < 30), "young_women_"),
    ((age >= 30) & (age < 45), "mature_women"),
    ((age >= 45) & (age < 65), "middle_age"),
    ((age >= 65) & (age < 75), "old_age"),
    (age >= 75, "elder_age"),
]
for age_mask, age_label in age_bands:
    df.loc[age_mask, 'AGE_CAT'] = age_label
df.groupby("AGE_CAT").agg({"Outcome": ["mean","count"]})
# Outcome
# mean count
# AGE_CAT
# mature_adult 0.494 239
# middle_age 0.530 117
# old_age 0.250 16
# young_adult 0.212 396
# # Create a Body Mass Index(BMI) Categorical variable
# Each (mask, label) pair tags the rows whose BMI falls in that band.
bmi = df['BMI']
bmi_bands = [
    (bmi < 16, "overweak"),
    ((bmi >= 16) & (bmi < 18.5), "weak"),
    ((bmi >= 18.5) & (bmi < 25), "normal"),
    ((bmi >= 25) & (bmi < 30), "overweight"),
    ((bmi >= 30) & (bmi < 35), "1st_Obese"),
    ((bmi >= 35) & (bmi < 45), "2nd_Obese"),
    (bmi >= 45, "3rd_Obese"),
]
for bmi_mask, bmi_label in bmi_bands:
    df.loc[bmi_mask, 'BMI_CAT'] = bmi_label
df.groupby("BMI_CAT").agg({"Outcome": ["mean","count"]})
# Outcome
# mean count
# BMI_CAT
# 1stObese 0.438 235
# 2ndObese 0.453 212
# 3rdObese 0.611 36
# normal 0.069 102
# overweight 0.223 179
# weak 0.000 4
df.head()
# # Create a Diastolic Blood Pressure Categorical variable
# All three bands are defined on BloodPressure. The upper bound of the
# "normal" band must test BloodPressure too, so the band is correct on its
# own rather than relying on the later "high" assignment to overwrite it.
df.loc[(df['BloodPressure'] < 70), 'DIASTOLIC_CAT'] = "low"
df.loc[(df['BloodPressure'] >= 70) & (df['BloodPressure'] < 90), 'DIASTOLIC_CAT'] = "normal"
df.loc[(df['BloodPressure'] >= 90), 'DIASTOLIC_CAT'] = "high"
df.groupby("DIASTOLIC_CAT").agg({"Outcome": ["mean", "count"]})
df["Insulin"].unique()
# # Create an Insulin Categorical variable
# Rows below 120 are tagged "normal", rows at or above 120 "abnormal";
# rows with a missing Insulin value match neither mask.
insulin = df['Insulin']
for insulin_mask, insulin_label in [(insulin < 120, "normal"), (insulin >= 120, "abnormal")]:
    df.loc[insulin_mask, 'INSULIN_CAT'] = insulin_label
df.groupby("INSULIN_CAT").agg({"Outcome": ["mean","count"]})
df.head()
df["Pregnancies"].value_counts()
# # Create a Pregnancies Categorical variable
# Each (mask, label) pair tags the rows whose pregnancy count falls in that band.
pregnancies = df['Pregnancies']
pregnancy_bands = [
    (pregnancies == 0, "unpregnant"),
    ((pregnancies > 0) & (pregnancies <= 5), "normal"),
    ((pregnancies > 5) & (pregnancies <= 10), "high"),
    (pregnancies > 10, "very high"),
]
for preg_mask, preg_label in pregnancy_bands:
    df.loc[preg_mask, 'PREG_CAT'] = preg_label
df.groupby("PREG_CAT").agg({"Outcome": ["mean","count"]})
# Out[43]:
# Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome GLUCOSE_CAT AGE_CAT BMI_CAT DIASTOLIC_CAT INSULIN_CAT PREG_CAT
# 0 6.000 148.000 72.000 35.000 NaN 33.600 0.627 50.000 1 hiperglisemi middle_age 1st_Obese normal NaN high
# 1 1.000 85.000 66.000 29.000 NaN 26.600 0.351 31.000 0 normal mature_women overweight low NaN normal
# 2 8.000 183.000 64.000 NaN NaN 23.300 0.672 32.000 1 hiperglisemi mature_women normal low NaN high
# 3 1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0 normal young_women_ overweight low normal normal
# 4 0.000 137.000 40.000 35.000 168.000 43.100 1.200 33.000 1 hiperglisemi mature_women 2nd_Obese low abnormal unpregnant
df.head()
###############################################
# * 2.3.Processing Encoding and One-Hot Encoding
###############################################
le = LabelEncoder()
# Binary columns: non-numeric columns holding exactly two distinct values.
# The dtype "kind" code (i = signed int, u = unsigned int, f = float) is
# checked instead of comparing against Python's int/float, whose matching
# NumPy width depends on the platform.
binary_cols = [col for col in df.columns
               if df[col].dtype.kind not in ("i", "u", "f") and df[col].nunique() == 2]
def label_encoder(dataframe, binary_col):
    """Replace ``binary_col`` with the integer codes from a fresh ``LabelEncoder``.

    The column is overwritten on ``dataframe`` itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    binary_col : str
        Column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the column encoded.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe
for col in binary_cols:
df = label_encoder(df, col)
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return ``dataframe`` with ``categorical_cols`` expanded into dummy columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to encode; a new frame is returned.
    categorical_cols : list
        Columns to expand with ``pd.get_dummies``.
    drop_first : bool, optional
        Passed through to ``pd.get_dummies``; when True the first level of
        each encoded column is dropped.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]
df = one_hot_encoder(df, ohe_cols)
df.head()
# Out[48]:
# Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome GLUCOSE_CAT_hipoglisemi GLUCOSE_CAT_imparied glucose GLUCOSE_CAT_normal AGE_CAT_middle_age AGE_CAT_old_age AGE_CAT_young_women_ BMI_CAT_2nd_Obese BMI_CAT_3rd_Obese BMI_CAT_normal BMI_CAT_overweight BMI_CAT_weak DIASTOLIC_CAT_low DIASTOLIC_CAT_normal INSULIN_CAT_1 INSULIN_CAT_2 PREG_CAT_normal PREG_CAT_unpregnant PREG_CAT_very high
# 0 6.000 148.000 72.000 35.000 NaN 33.600 0.627 50.000 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0
# 1 1.000 85.000 66.000 29.000 NaN 26.600 0.351 31.000 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0
# 2 8.000 183.000 64.000 NaN NaN 23.300 0.672 32.000 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0
# 3 1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0
# 4 0.000 137.000 40.000 35.000 168.000 43.100 1.200 33.000 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0
###############################################
# * 2.4.Standardization for numerical variables
###############################################
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df[num_cols].head()
df.head()
###############################################
# * 2.5.Create Modelling
###############################################
# Separate the target from the features and hold out 30% of the rows for testing.
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training split and score it on the held-out split.
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Pass the true labels first, then the predictions, matching the metric's signature.
accuracy_score(y_test, y_pred)
# Out[81]: 0.7705627705627706