import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Better Housing Regression w/ Ensemble Models
Data from https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data
Goal: beat the basic regression done in HousingRegression.ipynb.
Score to beat: .14834 (lower is better)
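For reference, the Kaggle score is the RMSE between the log of the predicted price and the log of the observed price, which is why everything below models log1p(SalePrice). A minimal sketch of the metric (the helper name is mine, not Kaggle's; log1p vs. log is negligible at house-price magnitudes):

def kaggle_rmse(y_true, y_pred):
    # Root mean squared error in log space, per the competition's evaluation page.
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))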
# Data Preprocessing
trainData = pd.read_csv("datasets/train.csv")
submissionData = pd.read_csv("datasets/test.csv")
yvals = trainData["SalePrice"].copy()
xvals = trainData.drop(columns="SalePrice")
price_bins = pd.qcut(yvals, q=8)
xtrain, xtest, ytrain, ytest = train_test_split(xvals, yvals, test_size=.15, stratify=price_bins)
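# Sanity check: stratifying on the price octiles should leave each split with
# roughly 12.5% of its rows in every bin.
print(price_bins.loc[ytrain.index].value_counts(normalize=True).sort_index())
print(price_bins.loc[ytest.index].value_counts(normalize=True).sort_index())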
# Model log1p(SalePrice): prices are right-skewed and the Kaggle metric is
# RMSE in log space, so this aligns the training target with the score.
ytrainLog = np.log1p(ytrain)
ytestLog = np.log1p(ytest)

# Feature Engineering
def transformX(df):
    # Composite size / quality / ratio features.
    df["TotalSF"] = df["GrLivArea"] + df["TotalBsmtSF"]
    df["QualityArea"] = df["OverallQual"] * df["TotalSF"]
    df["TotalBath"] = 0.5*df["HalfBath"] + df["FullBath"] + df["BsmtFullBath"] + 0.5*df["BsmtHalfBath"]
    df["QualityScore"] = df["OverallQual"] * df["OverallCond"]
    df["BasementRatio"] = df["TotalBsmtSF"] / (df["GrLivArea"]+1)
    df["LivingSpaceRatio"] = df["GrLivArea"] / (df["LotArea"]+1)
    # Age features.
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    df['YearsSinceRemod'] = df['YrSold'] - df['YearRemodAdd']
    # df['IsNew'] = (df['YearBuilt'] == df['YrSold']).astype(int)
    # Ordinal encodings of the quality ratings.
    df['ExterQualNum'] = df['ExterQual'].map({'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1}).fillna(0)
    df['KitchenQualNum'] = df['KitchenQual'].map({'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1}).fillna(0)
    df['BsmtFinishedRatio'] = (df['BsmtFinSF1'] + df['BsmtFinSF2']) / (df['TotalBsmtSF'] + 1)
    df['BsmtFinishedRatio'] = df['BsmtFinishedRatio'].fillna(0)
    df['TotalPorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
    df['TotalPorchSF'] = np.log1p(df['TotalPorchSF'])
    # log1p the skewed area features. TotalSF is re-derived from the raw
    # columns here, before those columns are themselves transformed.
    df["TotalSF"] = np.log1p(df["GrLivArea"] + df["TotalBsmtSF"])
    df["LotArea"] = np.log1p(df["LotArea"])
    df["MasVnrArea"] = np.log1p(df["MasVnrArea"])
    df["BsmtFinSF1"] = np.log1p(df["BsmtFinSF1"])
    df["BsmtFinSF2"] = np.log1p(df["BsmtFinSF2"])
    df["BsmtUnfSF"] = np.log1p(df["BsmtUnfSF"])
    df["TotalBsmtSF"] = np.log1p(df["TotalBsmtSF"])
    df["1stFlrSF"] = np.log1p(df["1stFlrSF"])
    df["2ndFlrSF"] = np.log1p(df["2ndFlrSF"])
    df["LowQualFinSF"] = np.log1p(df["LowQualFinSF"])
    df["GrLivArea"] = np.log1p(df["GrLivArea"])
    df["GarageArea"] = np.log1p(df["GarageArea"])
    df["WoodDeckSF"] = np.log1p(df["WoodDeckSF"])
    df["OpenPorchSF"] = np.log1p(df["OpenPorchSF"])
    df["EnclosedPorch"] = np.log1p(df["EnclosedPorch"])
# Fill numeric NaNs on the raw columns first so the engineered ratio and log
# features don't propagate them, then engineer features, and only then record
# the column positions for the ColumnTransformers below; columns not listed
# there are silently dropped, so the engineered features must already exist.
numFill = xtrain.select_dtypes(include="number").columns
xtrain.loc[:, numFill] = xtrain.loc[:, numFill].fillna(0)
xtest.loc[:, numFill] = xtest.loc[:, numFill].fillna(0)
transformX(xtrain)
transformX(xtest)
catVars = list(xtrain.select_dtypes(exclude="number").columns)
catVars += ["MSSubClass", "OverallQual", "OverallCond"]
numVars = list(xtrain.select_dtypes(include="number").drop(columns=["Id", "MSSubClass", "OverallQual", "OverallCond"]).columns)
catIndices = [xtrain.columns.get_loc(col) for col in catVars]
numIndices = [xtrain.columns.get_loc(col) for col in numVars]

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
pipelineNoScaling = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), catIndices),
    ("num", "passthrough", numIndices)
])
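# The tree ensembles below use this unscaled variant: split-based models are
# invariant to monotone feature scaling, so only the linear Elastic Net gets
# StandardScaler. handle_unknown="ignore" keeps category levels that appear
# only in the test/submission data from crashing transform().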
pipelineScaling = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), catIndices),
    ("num", StandardScaler(), numIndices)
])

# The Models
- Elastic Net -> the one built last time
- Bagging Regressor w/ Decision Trees
- Random Forest Regressor
- AdaBoost w/ Decision Trees
- Gradient Boosting
- CatBoost
Lastly: use stacking to combine predictions from other models.
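Every model below is evaluated the same way: RMSE on the log target (the quantity Kaggle scores) and RMSE back in dollars after expm1. The notebook repeats that pattern inline; as a sketch, it amounts to a helper like this (report_rmse is illustrative, not part of the original):

def report_rmse(name, predictionsLog, ytestLog):
    # RMSE in log space, matching the training target and the Kaggle metric.
    logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
    # The same errors expressed in dollars.
    originalRMSE = np.sqrt(np.mean((np.expm1(predictionsLog) - np.expm1(ytestLog))**2))
    print(f"{name} Log-space RMSE: {logRMSE}")
    print(f"{name} Original-space RMSE: {originalRMSE}")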
# Elastic Net
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
netPipeline = Pipeline([
    ('preprocessing', pipelineScaling),
    ('model', ElasticNet())
])
param_grid = {
    'model__alpha': [0.01, 0.1, 1.0],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
grid_search = GridSearchCV(netPipeline, param_grid, cv=10,
                           scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(xtrain, ytrainLog)
elasticNetModel = grid_search.best_estimator_
predictionsLog = elasticNetModel.predict(xtest)
logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
print(f"Elastic Net Log-space RMSE: {logRMSE}")
predictions = np.expm1(predictionsLog)
originalRMSE = np.sqrt(np.mean((predictions - ytest)**2))
print(f"Elastic Net Original-space RMSE: {originalRMSE}")Elastic Net Log-space RMSE: 0.11601083060170428
Elastic Net Original-space RMSE: 22587.012058965793
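Since the RMSE is computed on log prices, 0.116 corresponds to a typical multiplicative error of exp(0.116) ≈ 1.12, i.e. predictions typically land within roughly 12% of the true sale price.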
# Bagging Regressor w/ Decision Trees
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
# Note: Grid Search was performed on each model to find good hyperparams. They are commented out to speed up notebook execution.
# bagPipeline = Pipeline([
# ('preprocessing', pipelineNoScaling),
# ('model', DecisionTreeRegressor())
# ])
# param_grid = {
# 'estimator__model__max_depth': [7, 10, 15, 20],
# 'estimator__model__max_leaf_nodes': [64, 128, 256, 512],
# 'estimator__model__min_samples_split': [5,10,20,50]
# }
# bagReg = BaggingRegressor(bagPipeline, n_jobs=-1, n_estimators=200, max_samples=.5)
# randSearch = RandomizedSearchCV(bagReg, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
# randSearch.fit(xtrain,ytrainLog)
# bagModel = randSearch.best_estimator_
bagPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', DecisionTreeRegressor(max_depth=10, max_leaf_nodes=512, min_samples_split=15))
])
bagReg = BaggingRegressor(bagPipeline, n_jobs=-1, n_estimators=200, max_samples=.5)
bagModel = bagReg.fit(xtrain,ytrainLog)
predictionsLog = bagModel.predict(xtest)
logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
print(f"Bagging Log-space RMSE: {logRMSE}")
predictions = np.expm1(predictionsLog)
originalRMSE = np.sqrt(np.mean((predictions - ytest)**2))
print(f"Bagging Original-space RMSE: {originalRMSE}")Bagging Log-space RMSE: 0.13714765596552775
Bagging Original-space RMSE: 27072.43921882722
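Bagging also offers a validation signal for free: because each tree sees only half the rows (max_samples=.5), the held-out "out-of-bag" rows can score the ensemble without touching xtest. A sketch, not part of the original run:

bagRegOob = BaggingRegressor(bagPipeline, n_jobs=-1, n_estimators=200,
                             max_samples=.5, oob_score=True)
bagRegOob.fit(xtrain, ytrainLog)
print(f"Bagging OOB R^2: {bagRegOob.oob_score_:.4f}")  # R^2 on out-of-bag predictions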
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# rfPipeline = Pipeline([
# ('preprocessing', pipelineNoScaling),
# ('model', RandomForestRegressor(n_estimators=200))
# ])
# param_grid = {
# 'model__max_depth': [7, 10, 15, 20],
# 'model__max_leaf_nodes': [64, 128, 256, 512],
# 'model__min_samples_split': [5,10,20,50]
# }
# randSearch = RandomizedSearchCV(rfPipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
# randSearch.fit(xtrain,ytrainLog)
# rfModel = randSearch.best_estimator_
rfPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', RandomForestRegressor(n_estimators=400, max_depth=10, max_leaf_nodes=700, min_samples_split=10, n_jobs=-1))
])
rfModel = rfPipeline.fit(xtrain,ytrainLog)
predictionsLog = rfModel.predict(xtest)
logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
print(f"Random Forest Log-space RMSE: {logRMSE}")
predictions = np.expm1(predictionsLog)
originalRMSE = np.sqrt(np.mean((predictions - ytest)**2))
print(f"Random Forest Original-space RMSE: {originalRMSE}")Random Forest Log-space RMSE: 0.13332204191522587
Random Forest Original-space RMSE: 26559.271547277673
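Random forests also expose impurity-based feature importances, a quick way to check whether the engineered features are pulling weight. A sketch (depending on the sklearn version, names may come back as positional x<i> placeholders because the ColumnTransformer was given integer indices):

featureNames = rfModel.named_steps['preprocessing'].get_feature_names_out()
importances = pd.Series(rfModel.named_steps['model'].feature_importances_, index=featureNames)
print(importances.sort_values(ascending=False).head(10))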
# AdaBoost w/ Decision Trees
from sklearn.ensemble import AdaBoostRegressor
# adaPipeline = Pipeline([
# ('preprocessing', pipelineNoScaling),
# ('model', AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=200, learning_rate=.25))
# ])
# param_grid = {
# 'model__estimator__max_depth': [7, 10, 15, 20],
# 'model__estimator__max_leaf_nodes': [64, 128, 256, 512],
# 'model__estimator__min_samples_split': [5,10,20,50]
# }
# randSearch = RandomizedSearchCV(adaPipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
# randSearch.fit(xtrain,ytrainLog)
# adaModel = randSearch.best_estimator_
adaPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', AdaBoostRegressor(DecisionTreeRegressor(max_depth=15, max_leaf_nodes=512, min_samples_split=15),
                                n_estimators=200, learning_rate=.2))
])
adaModel = adaPipeline.fit(xtrain,ytrainLog)
predictionsLog = adaModel.predict(xtest)
logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
print(f"Adaboost Log-space RMSE: {logRMSE}")
predictions = np.expm1(predictionsLog)
originalRMSE = np.sqrt(np.mean((predictions - ytest)**2))
print(f"Adaboost Original-space RMSE: {originalRMSE}")Adaboost Log-space RMSE: 0.12707190516224603
Adaboost Original-space RMSE: 25975.415832856324
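AdaBoost predictions can be replayed stage by stage to check whether 200 estimators was too many. A sketch using staged_predict, not part of the original run:

XtestTransformed = adaModel.named_steps['preprocessing'].transform(xtest)
stagedRMSE = [np.sqrt(np.mean((p - ytestLog)**2))
              for p in adaModel.named_steps['model'].staged_predict(XtestTransformed)]
print(f"Best at {int(np.argmin(stagedRMSE)) + 1} estimators: RMSE {min(stagedRMSE):.5f}")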
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
# gbPipeline = Pipeline([
# ('preprocessing', pipelineNoScaling),
# ('model', GradientBoostingRegressor(max_depth=15, max_leaf_nodes=600, min_samples_split=10))
# ])
# param_grid = {
# 'model__learning_rate': [.05, .1, .25, .5, .75],
# 'model__n_estimators': [100, 200, 350, 500]
# }
# randSearch = RandomizedSearchCV(gbPipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
# randSearch.fit(xtrain,ytrainLog)
# gbModel = randSearch.best_estimator_
gbPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', GradientBoostingRegressor(max_depth=10, max_leaf_nodes=512, min_samples_split=10,
                                        learning_rate=.1, n_estimators=200))
])
gbModel = gbPipeline.fit(xtrain,ytrainLog)
predictionsLog = gbModel.predict(xtest)
logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
print(f"Gradient Boosting Log-space RMSE: {logRMSE}")
predictions = np.expm1(predictionsLog)
originalRMSE = np.sqrt(np.mean((predictions - ytest)**2))
print(f"Gradient Boosting Original-space RMSE: {originalRMSE}")Gradient Boosting Log-space RMSE: 0.13848294511063375
Gradient Boosting Original-space RMSE: 28934.793128350313
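Gradient boosting tends to overfit as estimators accumulate; sklearn can stop adding trees automatically when an internal validation split stops improving. A sketch (hyperparameters copied from above; the early-stopping settings are illustrative, not tuned):

gbEarlyPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', GradientBoostingRegressor(max_depth=10, max_leaf_nodes=512, min_samples_split=10,
                                        learning_rate=.1, n_estimators=1000,
                                        validation_fraction=0.1, n_iter_no_change=20))
])
gbEarlyPipeline.fit(xtrain, ytrainLog)
print(f"Stopped at {gbEarlyPipeline.named_steps['model'].n_estimators_} trees")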
# CatBoost
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
def fill_categorical_na(X):
    # CatBoost errors on NaN in categorical features (numeric NaNs it handles
    # natively), so give missing categories an explicit 'Missing' level.
    X_copy = X.copy()
    for col in catBoostVars:
        X_copy[col] = X_copy[col].fillna('Missing')
    return X_copy

# Everything in catVars except the three appended numeric codes
# (MSSubClass, OverallQual, OverallCond), which CatBoost can take as-is.
catBoostVars = catVars[:-3]
catPipeline = Pipeline([
    ('imputer', FunctionTransformer(fill_categorical_na)),
    ('model', CatBoostRegressor(cat_features=catBoostVars, logging_level="Silent", thread_count=-1))
])
catModel = catPipeline.fit(xtrain, ytrainLog)
predictionsLog = catModel.predict(xtest)
logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
print(f"Catboost Log-space RMSE: {logRMSE}")
predictions = np.expm1(predictionsLog)
originalRMSE = np.sqrt(np.mean((predictions - ytest)**2))
print(f"Catboost Original-space RMSE: {originalRMSE}")Catboost Log-space RMSE: 0.11527835635550543
Catboost Original-space RMSE: 22649.277824848843
# Stacking
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
stack = StackingRegressor(
    estimators=[
        ("elasticNet", elasticNetModel),
        ("randForest", rfModel),
        ("ada", adaModel),
        ("catBoost", catModel)
    ],
    final_estimator=Ridge(alpha=3),
    cv=10,
    n_jobs=-1
)
stack.fit(xtrain,ytrainLog)
predictionsLog = stack.predict(xtest)
logRMSE = np.sqrt(np.mean((predictionsLog - ytestLog)**2))
print(f"Stacking Log-space RMSE: {logRMSE}")
predictions = np.expm1(predictionsLog)
originalRMSE = np.sqrt(np.mean((predictions - ytest)**2))
print(f"Stacking Original-space RMSE: {originalRMSE}")Stacking Log-space RMSE: 0.1097353694673458
Stacking Original-space RMSE: 21425.179045261502
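With cv=10, StackingRegressor refits each base model ten times to generate out-of-fold predictions, and the Ridge meta-learner is trained on those, so its coefficients show how the blend weights the four models. A quick peek, assuming the fit above:

blendWeights = dict(zip([name for name, _ in stack.estimators], stack.final_estimator_.coef_))
print(blendWeights)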
# Final Model
# Apply the same preprocessing to the full training set and the submission
# data: fill NaNs on the raw numeric columns, then engineer features.
xvals.loc[:, numFill] = xvals.loc[:, numFill].fillna(0)
submissionData.loc[:, numFill] = submissionData.loc[:, numFill].fillna(0)
transformX(xvals)
transformX(submissionData)
yvalsLog = np.log1p(yvals)
stack.fit(xvals,yvalsLog)
predictionsLog = stack.predict(submissionData)
predictions = np.expm1(predictionsLog)

submission = pd.DataFrame({
    'Id': submissionData["Id"],
    'SalePrice': predictions
})
submission.to_csv('submission.csv', index=False)

Final Kaggle Score: .12214, comfortably under the .14834 target.
Ideas on how to get lower:
- LightGBM, XGBoost models (see the sketch below)
- More feature engineering
- Fine tune CatBoost
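Of these, LightGBM would slot straight into the existing unscaled pipeline. A rough, untested sketch; the hyperparameters are placeholders, not tuned values:

from lightgbm import LGBMRegressor

lgbPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', LGBMRegressor(n_estimators=500, learning_rate=0.05))
])
lgbPipeline.fit(xtrain, ytrainLog)
print(np.sqrt(np.mean((lgbPipeline.predict(xtest) - ytestLog)**2)))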