-
Notifications
You must be signed in to change notification settings - Fork 1
/
Q1Feb9
199 lines (162 loc) · 8.07 KB
/
Q1Feb9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/* Read the test CSV into WORK.test (file with altered column names);
   the first row supplies the variable names. */
proc import
        datafile = '/home/lsterling0/Imports/test.csv'
        out      = work.test
        dbms     = csv
        replace;
    getnames = yes;
run;

/* Read the train CSV into WORK.train (file with altered column names);
   the first row supplies the variable names. */
proc import
        datafile = '/home/lsterling0/Imports/train.csv'
        out      = work.train
        dbms     = csv
        replace;
    getnames = yes;
run;
/* The test set needs a SalePrice column; create it as missing on every row. */
data test;
    set test;
    SalePrice = .;
run;

/* We need to look into outliers of the data. */
/* Stack train and test into a single dataset, Train2, and convert LotFrontage,
   which was imported as character, into a numeric column of the same name. */
data Train2;
    set Train test;                              /* train rows first, then test rows */

    /* The ?? modifier suppresses the invalid-data notes and keeps _ERROR_ from being
       set when a value is not a number (presumably the text 'NA' - verify against the
       CSV). Such values become missing, exactly as they did without the modifier. */
    LotFrontageNum = input(LotFrontage, ?? 8.);

    drop LotFrontage;                            /* discard the character original */
    rename LotFrontageNum = LotFrontage;         /* numeric copy takes over the original name */
run;
/* Add transformed versions of the response and of several skewed predictors to Train2.
   Rows with a missing input (for example SalePrice on the test rows) get a missing
   result. */
data Train2;
    set Train2;

    logSalePrice   = log(SalePrice);
    logGrLivArea   = log(GrLivArea);

    /* These two are referenced by the forward-selection MODEL statement later in this
       script but were never created, which makes that step fail with a
       variable-not-found error. */
    logLotFrontage = log(LotFrontage);
    logLotArea     = log(LotArea);

    /* log(0) is an invalid argument in SAS and yields a missing value, and modelling
       procedures drop every row that has a missing predictor. These three measures
       are presumably 0 for houses without the feature (verify against the data), so
       shift by 1 before taking the log: 0 maps to 0 instead of missing. */
    logLowQualFinSF = log(LowQualFinSF + 1);     /* does not help */
    logPoolArea     = log(PoolArea + 1);
    logMiscVal      = log(MiscVal + 1);

    /* We tried both log and sqrt here - neither helped with the cluster. */
    SRTotalBsmtSF  = sqrt(TotalBsmtSF);
run;
/* From here on out we will be using train2. */

/* EXPLORATORY ANALYSIS */
/* The numeric variables are plotted against logSalePrice in 4 separate scatterplot
   matrices, because a single matrix with everything in it came out too small to read. */

/* Matrix 1 of 4:
   MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea */
proc sgscatter data=train2;
    matrix logSalePrice
           MSSubClass LotFrontage LotArea OverallQual
           OverallCond YearBuilt YearRemodAdd MasVnrArea;
run;

/* Observations from matrix 1:
     LotArea     - potential outliers
     LotFrontage - data looks off */

/* Single-predictor fit with diagnostics for LotArea: definite outlier(s). */
proc glm data=train2 plots=all;
    model logSalePrice = LotArea;
run;

/* Single-predictor fit with diagnostics for LotFrontage: looks okay. */
proc glm data=train2 plots=all;
    model logSalePrice = LotFrontage;
run;
/* Matrix 2 of 4:
   BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF FirstFlrSF SecondFlrSF LowQualFinSF GrLivArea */
proc sgscatter data=train2;
    matrix logSalePrice
           BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
           FirstFlrSF SecondFlrSF LowQualFinSF GrLivArea;
run;

/* Fits logSalePrice on one predictor from train2 and produces every diagnostic plot. */
%macro single_predictor_fit(predictor);
    proc glm data=train2 plots=all;
        model logSalePrice = &predictor;
    run;
%mend single_predictor_fit;

/* Same six fits, in the same order, as the original one-step-per-variable code. */
%single_predictor_fit(BsmtFinSF1)
%single_predictor_fit(logGrLivArea)
%single_predictor_fit(logLowQualFinSF)
%single_predictor_fit(SRTotalBsmtSF)
%single_predictor_fit(logPoolArea)
%single_predictor_fit(logMiscVal)
/* Matrix 3 of 4:
   BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces */
proc sgscatter data=train2;
    matrix logSalePrice
           BsmtFullBath BsmtHalfBath FullBath HalfBath
           BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces;
run;

/* Matrix 4 of 4:
   GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch ThreeSsnPorch
   ScreenPorch PoolArea MiscVal MoSold YrSold */
proc sgscatter data=train2;
    matrix logSalePrice
           GarageYrBlt GarageCars GarageArea
           WoodDeckSF OpenPorchSF EnclosedPorch ThreeSsnPorch ScreenPorch
           PoolArea MiscVal MoSold YrSold;
run;
/*Now that we have logged values that seemed to be off, we can start looking for any outliers*/
/* Template for inspecting one candidate variable at a time with PROC GLM diagnostics.
   The CLASS list and the right-hand side of MODEL are intentionally left blank:
   fill them in with the variable being considered before running this step. */
*I am not sure that proc glm is the best way to look at outliers when there are so many class variables;
proc glm data=Train2 plots = all;
class ; *should we include all?;
*model logSalePrice ; *Use this if logSalePrice looks better;
model SalePrice = /solution; *insert variable being considered;
run;
/*FORWARD SELECTION*/
/* Forward selection on logSalePrice, stopping on the 5-fold cross-validation criterion.
   This run uses the logged versions of LotFrontage, LotArea, GrLivArea, PoolArea and
   MiscVal, so logLotFrontage, logLotArea, logGrLivArea, logPoolArea and logMiscVal
   must all exist in train2.
   SEED= fixes the random assignment of observations to CV folds; without it the seed
   comes from the clock and the selected model can change from one run to the next.
   Predictions for every row of train2 are written to results2 in the variable predict
   (the other selection runs in this script write to the same dataset). */
proc glmselect data=train2 plots=all seed=20240209;
    class MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope
          Neighborhood Condition1 Condition2 BldgType HouseStyle RoofStyle RoofMatl
          Exterior1st Exterior2nd MasVnrType ExterQual ExterCond Foundation
          BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
          Heating HeatingQC CentralAir Electrical KitchenQual Functional FireplaceQu
          GarageType GarageFinish GarageQual GarageCond PavedDrive
          PoolQC Fence MiscFeature SaleType SaleCondition;
    model logSalePrice =
          MSSubClass logLotFrontage logLotArea OverallQual OverallCond
          YearBuilt YearRemodAdd MasVnrArea
          BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
          FirstFlrSF SecondFlrSF LowQualFinSF logGrLivArea
          BsmtFullBath BsmtHalfBath FullBath HalfBath
          BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
          GarageYrBlt GarageCars GarageArea
          WoodDeckSF OpenPorchSF EnclosedPorch ThreeSsnPorch ScreenPorch
          logPoolArea logMiscVal MoSold YrSold
          / selection=Forward(stop=CV) cvmethod=random(5) stats=adjrsq;
    output out=results2 p=predict;
run;
/*Check assumptions for given model*/
/* NOTE(review): the MATRIX, MODEL and CLASS statements in the three steps below are
   empty. They appear to be placeholders for the response and the terms chosen by the
   forward selection above - fill them in before running these steps. */
*Scatterplot Matrix;
proc sgscatter data=train2;
matrix ;
run;
*Assumption plots and VIF;
proc reg data=train2 plots=all;
model / VIF;
run;
/*Get confidence intervals for final model */
/* SOLUTION prints the parameter estimates; CLPARM adds their confidence limits. */
proc glm data = train2 plots = all;
class ;
model /solution clparm;
run;
/*BACKWARD SELECTION*/
/* Backward elimination on logSalePrice, stopping on the 5-fold cross-validation
   criterion. None of the predictors are logged in this run.
   SEED= fixes the random assignment of observations to CV folds; without it the seed
   comes from the clock and the selected model can change from one run to the next.
   Predictions for every row of train2 are written to results2 in the variable predict
   (the other selection runs in this script write to the same dataset). */
proc glmselect data=train2 plots=all seed=20240209;
    class MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope
          Neighborhood Condition1 Condition2 BldgType HouseStyle RoofStyle RoofMatl
          Exterior1st Exterior2nd MasVnrType ExterQual ExterCond Foundation
          BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
          Heating HeatingQC CentralAir Electrical KitchenQual Functional FireplaceQu
          GarageType GarageFinish GarageQual GarageCond PavedDrive
          PoolQC Fence MiscFeature SaleType SaleCondition;
    model logSalePrice =
          MSSubClass LotFrontage LotArea OverallQual OverallCond
          YearBuilt YearRemodAdd MasVnrArea
          BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
          FirstFlrSF SecondFlrSF LowQualFinSF GrLivArea
          BsmtFullBath BsmtHalfBath FullBath HalfBath
          BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
          GarageYrBlt GarageCars GarageArea
          WoodDeckSF OpenPorchSF EnclosedPorch ThreeSsnPorch ScreenPorch
          PoolArea MiscVal MoSold YrSold
          / selection=Backward(stop=CV) cvmethod=random(5) stats=adjrsq;
    output out=results2 p=predict;
run;
/*Check assumptions for given model*/
/* NOTE(review): the MATRIX, MODEL and CLASS statements in the three steps below are
   empty. They appear to be placeholders for the response and the terms chosen by the
   backward selection above - fill them in before running these steps. */
*Scatterplot Matrix;
proc sgscatter data=train2;
matrix ;
run;
*Assumption plots and VIF;
proc reg data=train2 plots=all;
model / VIF;
run;
/*Get confidence intervals for final model*/
/* SOLUTION prints the parameter estimates; CLPARM adds their confidence limits. */
proc glm data = train2 plots = all;
class ;
model /solution clparm;
run;
/*STEPWISE SELECTION*/
/* Stepwise selection, stopping on the 5-fold cross-validation criterion. None of the
   predictors are logged in this run.
   The response is logSalePrice rather than raw SalePrice: this matches the forward and
   backward runs, and the submission step later in this script applies exp() to the
   predictions in results2, which is only valid for a model fitted on the log scale.
   SEED= fixes the random assignment of observations to CV folds; without it the seed
   comes from the clock and the selected model can change from one run to the next.
   Predictions for every row of train2 are written to results2 in the variable predict
   (the other selection runs in this script write to the same dataset). */
proc glmselect data=train2 plots=all seed=20240209;
    class MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope
          Neighborhood Condition1 Condition2 BldgType HouseStyle RoofStyle RoofMatl
          Exterior1st Exterior2nd MasVnrType ExterQual ExterCond Foundation
          BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
          Heating HeatingQC CentralAir Electrical KitchenQual Functional FireplaceQu
          GarageType GarageFinish GarageQual GarageCond PavedDrive
          PoolQC Fence MiscFeature SaleType SaleCondition;
    model logSalePrice =
          MSSubClass LotFrontage LotArea OverallQual OverallCond
          YearBuilt YearRemodAdd MasVnrArea
          BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
          FirstFlrSF SecondFlrSF LowQualFinSF GrLivArea
          BsmtFullBath BsmtHalfBath FullBath HalfBath
          BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
          GarageYrBlt GarageCars GarageArea
          WoodDeckSF OpenPorchSF EnclosedPorch ThreeSsnPorch ScreenPorch
          PoolArea MiscVal MoSold YrSold
          / selection=Stepwise(stop=CV) cvmethod=random(5) stats=adjrsq;
    output out=results2 p=predict;
run;
/*Check assumptions for given model*/
/* NOTE(review): the MATRIX, MODEL and CLASS statements in the three steps below are
   empty. They appear to be placeholders for the response and the terms chosen by the
   stepwise selection above - fill them in before running these steps. */
*Scatterplot Matrix;
proc sgscatter data=train2;
matrix ;
run;
*Assumption plots and VIF;
proc reg data=train2 plots=all;
model / VIF;
run;
/*Get confidence intervals for final model*/
/* SOLUTION prints the parameter estimates; CLPARM adds their confidence limits. */
proc glm data = train2 plots = all;
class ;
model /solution clparm;
run;
/* Build the file to submit to Kaggle from the predictions in results2.
   Only rows with id > 1460 are read (the test rows, whose SalePrice was empty), and
   only id and SalePrice are written out. */
data results3 (keep = id SalePrice);
    set results2 (where = (id > 1460));

    /* Positive prediction: back-transform from the log scale. This line is for a
       model fitted on logged SalePrice; for an unlogged response use
           if predict > 0 then SalePrice = Predict;
       instead. */
    if predict > 0 then SalePrice = exp(Predict);
    /* Negative prediction: substitute a fixed 10000 so no negative value is submitted. */
    else if predict < 0 then SalePrice = 10000;
run;

/* Write the submission as CSV. _dataout is a fileref that is not assigned in the
   visible code - presumably supplied by the SAS environment; confirm before running. */
proc export
        data    = results3
        outfile = _dataout
        dbms    = csv
        replace;
run;