In [3]:
# Imports
# Ignore Warnings 
import warnings
warnings.filterwarnings('ignore')

# Basic Imports 
import numpy as np
import pandas as pd
import time

# Preprocessing
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# Metrics 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ML Models
from lightgbm import LGBMRegressor 
from xgboost import XGBRegressor 
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import svm
from mlxtend.regressor import StackingCVRegressor

In [4]:
# Reading a CSV File
# 9015
df_NN = pd.read_csv("~/data/Melbourne_housing_pre.csv",  encoding="utf-8")

X=df_NN[['Year','YearBuilt','Distance','Lattitude','Longtitude','Propertycount',
          'Landsize','BuildingArea', 'Rooms','Bathroom', 'Car','Type_h','Type_t','Type_u']]
y=df_NN['LogPrice']
train_X, valid_X, train_y, valid_y = train_test_split(X,y, test_size = .20, random_state=42)

train_X2 = train_X.copy()
valid_X2 = valid_X.copy()

# Data standardization
mean = train_X.mean(axis=0)
train_X -= mean
std = train_X.std(axis=0)
train_X /= std
valid_X -= mean
valid_X /= std

In [5]:
##% evaluateRegressor
# from sklearn.metrics import mean_squared_error, mean_absolute_error
def evaluateRegressor(true,predicted,message = "Test set"):
    R2 = r2_score(true,predicted)
    print(message)
    print("R2 :", R2)

In [6]:
##% Initial Models
SVM = svm.SVR().fit(train_X, train_y) 
RFReg = RandomForestRegressor(random_state = 0).fit(train_X, train_y)
GBReg = GradientBoostingRegressor(random_state=0).fit(train_X,train_y)
XGReg = XGBRegressor(objective ='reg:squarederror', seed = 0,verbosity=0).fit(train_X,train_y) 
LGBMReg = LGBMRegressor(random_state=0).fit(train_X,train_y)
CatReg = CatBoostRegressor(random_seed=0, verbose=False).fit(train_X, train_y)
# Stack up all the models above, optimized using xgboost
# 用本数据集中性能最好的CatReg模型作元数据回归 
stack_gen = StackingCVRegressor(regressors=(GBReg, XGReg, LGBMReg, CatReg, RFReg),             
                                meta_regressor=CatReg,                        
                                use_features_in_secondary=True)


# 以下是调参之前的性能

In [7]:
print("Support Vector Machine") 
acc = cross_val_score(SVM, train_X, train_y).mean() 
print('acc: ',acc)
predicted_train_y = SVM.predict(train_X)
evaluateRegressor(train_y,predicted_train_y,"    Training Set")
predicted_valid_y = SVM.predict(valid_X)
evaluateRegressor(valid_y,predicted_valid_y,"    Test Set")

Support Vector Machine
acc:  0.866595302426467
    Training Set
R2 : 0.8993062555141255
    Test Set
R2 : 0.8611643128805611




In [8]:
##% Model Metrics
print("Random Forest Regressor") 
acc = cross_val_score(RFReg, train_X, train_y).mean() 
print('acc: ',acc)
predicted_train_y = RFReg.predict(train_X)
evaluateRegressor(train_y,predicted_train_y,"    Training Set")
predicted_valid_y = RFReg.predict(valid_X)
evaluateRegressor(valid_y,predicted_valid_y,"    Test Set")

Random Forest Regressor
acc:  0.8800957337658776
    Training Set
R2 : 0.98384032693932
    Test Set
R2 : 0.8753312384639034


In [9]:
print("GB Regressor") 
acc = cross_val_score(GBReg, train_X, train_y).mean() 
print('acc: ',acc)
predicted_train_y = GBReg.predict(train_X)
evaluateRegressor(train_y,predicted_train_y,"    Training Set")
predicted_valid_y = GBReg.predict(valid_X)
evaluateRegressor(valid_y,predicted_valid_y,"    Test Set")

GB Regressor
acc:  0.863957355555318
    Training Set
R2 : 0.8841496339500643
    Test Set
R2 : 0.8580268336558272


In [10]:
print("XGBoost Regressor") 
acc = cross_val_score(XGReg, train_X, train_y).mean() 
print('acc: ',acc)
predicted_train_y = XGReg.predict(train_X)
evaluateRegressor(train_y,predicted_train_y,"    Training Set")
predicted_valid_y = XGReg.predict(valid_X)
evaluateRegressor(valid_y,predicted_valid_y,"    Test Set")

XGBoost Regressor
acc:  0.8925212846034665
    Training Set
R2 : 0.9744489742890312
    Test Set
R2 : 0.8881685589231831


In [11]:
print("LightGBM Regressor") 
acc = cross_val_score(LGBMReg, train_X, train_y).mean() 
print('acc: ',acc)
predicted_train_y = LGBMReg.predict(train_X)
evaluateRegressor(train_y,predicted_train_y,"    Training Set")
predicted_valid_y = LGBMReg.predict(valid_X)
evaluateRegressor(valid_y,predicted_valid_y,"    Test Set")

LightGBM Regressor
acc:  0.8958033981316488
    Training Set
R2 : 0.9386913909702984
    Test Set
R2 : 0.8885361413556274


In [12]:
print("CatBoost Machine") 
acc = cross_val_score(LGBMReg, train_X, train_y).mean() 
print('acc: ',acc)
predicted_train_y = CatReg.predict(train_X)
evaluateRegressor(train_y,predicted_train_y,"    Training Set")
predicted_valid_y = CatReg.predict(valid_X)
evaluateRegressor(valid_y,predicted_valid_y,"    Test Set")

CatBoost Machine
acc:  0.8958033981316488
    Training Set
R2 : 0.9520867059152777
    Test Set
R2 : 0.9024407889891998


In [13]:
print("stack_gen") 
t1=time.time()
stack_gen_model = stack_gen.fit(train_X, train_y)
acc = cross_val_score(stack_gen_model, train_X, train_y).mean() 
t2=time.time()
print(t2-t1)
print('acc: ',acc)
print('Train R2: ',stack_gen_model.score(train_X,train_y))
print('Test R2: ', stack_gen_model.score(valid_X,valid_y))

stack_gen
190.44106149673462
acc:  0.9054542946075245
Train R2:  0.9709790018643034
Test R2:  0.9010047075850618


In [14]:
# Blend models in order to make the final predictions more robust to overfitting
def blended_predictions(X):
    return (
            #(0.01 * lasso_model_full_data.predict(X)) + \
            #(0.03 * SVM.predict(X)) + \
            (0.09 * CatReg.predict(X)) + \
            (0.08 * LGBMReg.predict(X)) + \
            (0.07 * XGReg.predict(X)) + \
            #(0.05 * GBReg.predict(X)) +     
            (0.06 * RFReg.predict(X)) +      
            (0.70 * stack_gen_model.predict(np.array(X))))

print("Blend") 
print('Train R2: ',r2_score(train_y, blended_predictions(train_X))) 
print('Test R2: ',r2_score(valid_y, blended_predictions(valid_X))) 

Blend
Train R2:  0.9712192630097092
Test R2:  0.9020490034092113


In [15]:
# 以下是调参后优化的性能，CatBoost在本数据集中的性能最优。
# CatBoost，LightGBM，GBR, XGB, Stack, Blend这些模型测试集中预测的准确率都超过了90%。

# SVM best---------------------------------------------------------------------------------------
params =  {'C': 6.673350889023755, 'gamma': 0.05106238973376298}
print(params)

# Original best parameters of GridSearchCV()
# Set extra_trees=True to avoid overfitting
svr_best = svm.SVR(**params) 
acc = cross_val_score(svr_best, train_X, train_y).mean()
print(acc)

# predict
svr_model_full_data = svr_best.fit(train_X, train_y)
print(svr_model_full_data.score(train_X,train_y))
print(svr_model_full_data.score(valid_X,valid_y))

{'C': 6.673350889023755, 'gamma': 0.05106238973376298}
0.867673528970494
0.9109218210115111
0.8659740738531594


In [16]:
# Random forest best ----------------------------------------------------------------------------
params =  {'max_depth': 26, 'max_features': 4, 'max_leaf_nodes': 1781, 'max_samples': 0.9703002612349153, 'min_samples_leaf': 0, 'min_samples_split': 0, 'n_estimators': 381}
params['max_depth'] = params['max_depth']+3
params['min_samples_split'] = params['min_samples_split']+2
params['max_leaf_nodes'] = params['max_leaf_nodes']+2
params['min_samples_leaf'] = params['min_samples_leaf']+1
params['n_estimators'] = params['n_estimators']+50
params['max_features'] = params['max_features']+3
print(params)

# Original best parameters of GridSearchCV()
# Set extra_trees=True to avoid overfitting
rf_best = RandomForestRegressor( random_state = 0,verbose=0, **params)
acc = cross_val_score(rf_best, train_X, train_y).mean()
print(acc)

# predict
rf_model_best = rf_best.fit(train_X, train_y)
print(rf_model_best.score(train_X,train_y))
print(rf_model_best.score(valid_X,valid_y))      

{'max_depth': 29, 'max_features': 7, 'max_leaf_nodes': 1783, 'max_samples': 0.9703002612349153, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 431}
0.8875152357477226
0.9806938145048832
0.8834725572033766


In [17]:
# GBR best ------------------------------------------------------------------------------------
params = {'alpha': 0.9014933457984278, 'learning_rate': 0.035668343067947715, 'max_depth': 6, 'max_features': 5, 'max_leaf_nodes': 943, 'min_impurity_decrease': 0.015183480929538314, 'min_samples_leaf': 2, 'min_samples_split': 30, 'n_estimators': 440, 'subsample': 0.6109167820106534}
params['n_estimators'] = params['n_estimators']+50
params['min_samples_split'] = params['min_samples_split']+2
params['min_samples_leaf'] = params['min_samples_leaf']+1
params['max_depth'] = params['max_depth']+3
params['max_features'] = params['max_features']+3
params['max_leaf_nodes'] = params['max_leaf_nodes']+2
print(params)

# Original best parameters of GridSearchCV()
gbr_best = GradientBoostingRegressor(random_state=0,verbose=0, **params)
acc = cross_val_score(gbr_best, train_X, train_y).mean()
print(acc)

# predict
gbr_model_full_data = gbr_best.fit(train_X, train_y)
print(gbr_model_full_data.score(train_X,train_y))
print(gbr_model_full_data.score(valid_X,valid_y))

{'alpha': 0.9014933457984278, 'learning_rate': 0.035668343067947715, 'max_depth': 9, 'max_features': 8, 'max_leaf_nodes': 945, 'min_impurity_decrease': 0.015183480929538314, 'min_samples_leaf': 3, 'min_samples_split': 32, 'n_estimators': 490, 'subsample': 0.6109167820106534}
0.9048864580124658
0.9787904796126121
0.90169312112898


In [18]:
# XGB best --------------------------------------------------------------------------------------
params =  {'colsample_bytree': 0.8413894273173292, 'gamma': 0.008478702584417519, 'learning_rate': 0.05508679239402551, 'max_bin': 4, 'max_depth': 5, 'min_child_weight': 24.524635200338793, 'n_estimators': 578, 'reg_alpha': 0.809791155072757, 'reg_lambda': 1.4490119256389808, 'subsample': 0.8429852720715357}
params['max_bin'] = params['max_bin']+50
params['max_depth'] = params['max_depth']+3
params['n_estimators'] = params['n_estimators']+100
print(params)

# Original best parameters of GridSearchCV()
# Set extra_trees=True to avoid overfitting
xgb_best = XGBRegressor(objective ='reg:squarederror', seed = 0,verbosity=0, **params)  # CPU 4.96s/trial
acc = cross_val_score(xgb_best, train_X, train_y).mean()
print(acc)

# predict
xgb_model_best = xgb_best.fit(train_X, train_y)
print(xgb_model_best.score(train_X,train_y))
print(xgb_model_best.score(valid_X,valid_y))

{'colsample_bytree': 0.8413894273173292, 'gamma': 0.008478702584417519, 'learning_rate': 0.05508679239402551, 'max_bin': 54, 'max_depth': 8, 'min_child_weight': 24.524635200338793, 'n_estimators': 678, 'reg_alpha': 0.809791155072757, 'reg_lambda': 1.4490119256389808, 'subsample': 0.8429852720715357}
0.9071367470617598
0.9724046936744005
0.9006038008884029


In [19]:
# LigthGBM best -------------------------------------------------------------------------------
params = {'colsample_bytree': 0.5142540541056978, 'learning_rate': 0.014284678929509775, 'max_bin': 161, 'max_depth': 4, 'min_child_samples': 5, 'min_child_weight': 4.534457967283932, 'min_split_gain': 0.0006363777341674458, 'n_estimators': 2006, 'num_leaves': 93, 'reg_alpha': 0.0037820689583625278, 'reg_lambda': 2.947360470949046, 'subsample': 0.9448608935296047, 'subsample_freq': 2}
params['max_bin'] = params['max_bin']+50
params['max_depth'] = params['max_depth']+3
params['num_leaves'] = params['num_leaves']+20
params['min_child_samples'] = params['min_child_samples']+10
params['subsample_freq'] = params['subsample_freq']+1
params['n_estimators'] = params['n_estimators']+1000
print(params)

# Original best parameters of GridSearchCV()
# Set extra_trees=True to avoid overfitting
lgbm_best = LGBMRegressor(seed=0, **params)
acc = cross_val_score(lgbm_best, train_X, train_y).mean()
print(acc)

# predict
lgb_model_best = lgbm_best.fit(train_X, train_y)
print(lgb_model_best.score(train_X,train_y))
print(lgb_model_best.score(valid_X,valid_y))

{'colsample_bytree': 0.5142540541056978, 'learning_rate': 0.014284678929509775, 'max_bin': 211, 'max_depth': 7, 'min_child_samples': 15, 'min_child_weight': 4.534457967283932, 'min_split_gain': 0.0006363777341674458, 'n_estimators': 3006, 'num_leaves': 113, 'reg_alpha': 0.0037820689583625278, 'reg_lambda': 2.947360470949046, 'subsample': 0.9448608935296047, 'subsample_freq': 3}
0.9075054302960665
0.9790954758709947
0.9031159392716986


In [20]:
# CatBoost best --------------------------------------------------------------------------------------
# params = {'bagging_temperature': 0.5402870554069704, 'border_count': 183, 'depth': 5, 'fold_len_multiplier': 4.43906516804156, 'iterations': 899, 'l2_leaf_reg': 8.334167765336101, 'learning_rate': 0.0997818676941431, 'random_strength': 6.564979609549752, 'rsm': 0.8975065545697877, 'subsample': 0.857395221266925}
params = {'bagging_temperature': 0.5635882292015041, 'border_count': 248, 'depth': 6, 'fold_len_multiplier': 5.291168825056879, 'iterations': 1475, 'l2_leaf_reg': 12.565511945262646, 'learning_rate': 0.12388354971096996, 'random_strength': 10.351653421398712, 'rsm': 0.6923096830950537, 'subsample': 0.771573979543828}
params['border_count'] = params['border_count']+150
params['depth'] = params['depth']+2
params['iterations'] = params['iterations']+500
print(params)

# Original best parameters of GridSearchCV()
cat_best = CatBoostRegressor(task_type='CPU',
                             random_seed=0,
                             leaf_estimation_iterations=1,
                             max_ctr_complexity=0,                             
                             verbose=False, **params)  # CPU 44.64s/trial
acc = cross_val_score(cat_best, train_X, train_y).mean()
print(acc)

# predict
cat_model_best = cat_best.fit(train_X, train_y)
print(cat_model_best.score(train_X,train_y))
print(cat_model_best.score(valid_X,valid_y))

{'bagging_temperature': 0.5402870554069704, 'border_count': 333, 'depth': 7, 'fold_len_multiplier': 4.43906516804156, 'iterations': 1399, 'l2_leaf_reg': 8.334167765336101, 'learning_rate': 0.0997818676941431, 'random_strength': 6.564979609549752, 'rsm': 0.8975065545697877, 'subsample': 0.857395221266925}
0.9074151020398504
0.9755374523207733
0.9064909405009918


In [21]:
# Stack up all the models above, optimized using xgboost
# 用本数据集中性能最好的cat_best模型作元数据回归  
stack_gen_best = StackingCVRegressor(regressors=(gbr_best,xgb_best,lgbm_best,cat_best,rf_best),             
                                meta_regressor=cat_best,                         
                                use_features_in_secondary=True)

print("stack_gen") 
t1 = time.time()
stack_gen_model_best = stack_gen_best.fit(train_X, train_y)
acc = cross_val_score(stack_gen_model_best, train_X, train_y).mean() 
t2= time.time()
print(t2-t1)
print('acc: ',acc)
print('Train R2: ',stack_gen_model_best.score(train_X,train_y))
print('Test R2: ', stack_gen_model_best.score(valid_X,valid_y))

stack_gen
705.9645202159882
acc:  0.9052247762353561
Train R2:  0.9789969737785568
Test R2:  0.901629807478781


In [22]:
# Blend models in order to make the final predictions more robust to overfitting
def blended_predictions2(X):
    return (
            (0.09 * cat_model_best.predict(X)) + \
            (0.08 * lgb_model_best.predict(X)) + \
            (0.07 * xgb_model_best.predict(X)) + \
            #(0.06 * gbr_model_best.predict(X)) + 
            (0.06 * rf_model_best.predict(X)) +                 
            (0.70 * stack_gen_model_best.predict(np.array(X))))

print("Blend") 
print('Train R2: ',r2_score(train_y, blended_predictions2(train_X))) 
print('Test R2: ',r2_score(valid_y, blended_predictions2(valid_X))) 

Blend
Train R2:  0.980425534222321
Test R2:  0.9044248603023165
