Model ensembling means combining several models in order to achieve better performance than any of the individual models could on its own.
An ensemble is a type of supervised algorithm in which we assign a weight to the output of each model under consideration. In practice, we tune the weights to maximize performance on a validation set, subject to the constraint that the weights sum to one.
Different types of models have different strengths and weaknesses
Some models are better at separating certain types of noise from signal
Some models are better at avoiding overfitting/underfitting to certain types of data
Having multiple models will likely reduce the variance in our predictions, reducing overfitting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
%matplotlib inline
# Seed NumPy's global RNG so the shuffle below is reproducible across runs.
np.random.seed(1)

# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this call needs an older scikit-learn release -- confirm the pinned
# version before running.
X, y = load_boston(return_X_y=True)

# 60% train, 20% validation, and the remaining ~20% test.
train_fraction = 0.6
val_fraction = 0.2
num_train = int(round(len(X) * train_fraction, 0))
num_val = int(round(len(X) * val_fraction, 0))

# Shuffle the row indices once, then cut the permutation into three
# contiguous, non-overlapping slices so no sample appears in two splits.
random_indices = np.random.permutation(len(X))
train_indices = random_indices[:num_train]
val_indices = random_indices[num_train:(num_train + num_val)]
test_indices = random_indices[(num_train + num_val):]

X_train = X[train_indices]
y_train = y[train_indices]
X_val = X[val_indices]
y_val = y[val_indices]
X_test = X[test_indices]
y_test = y[test_indices]

# Sanity-check the size of every split.
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_val shape:', X_val.shape)
print('y_val shape:', y_val.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)
# Notebook-style preview of the raw (unscaled) training features.
pd.DataFrame(X_train).head()

# Standardize every split with the TRAINING split's per-feature mean and
# standard deviation. The statistics are captured before X_train is
# overwritten: recomputing them from the already-standardized X_train would
# yield roughly mean 0 / std 1 and leave the validation and test features
# effectively unscaled.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)
X_train = (X_train - train_mean) / train_std
X_val = (X_val - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

# Notebook-style preview of the standardized training features.
pd.DataFrame(X_train).head()
# The two base regressors that will be blended: a linear-kernel support
# vector regressor and a 64-tree random forest limited to depth 2.
svm = SVR(
    kernel='linear',
    epsilon=8,
    C=0.0003,
)
rf = RandomForestRegressor(
    n_estimators=64,
    max_depth=2,
    max_features=3,
)

# Fit both on the training split -- the SVM first, then the forest.
for estimator in (svm, rf):
    estimator.fit(X_train, y_train)

print('done training')
# Predictions of each fitted model on the validation split.
y_vpred_svm = svm.predict(X_val)
y_vpred_rf = rf.predict(X_val)

# Candidate random-forest weights 0.0, 0.1, ..., 1.0. np.linspace pins the
# number of grid points, whereas np.arange with a fractional step can gain or
# lose the endpoint through floating-point rounding. The grid is built once
# and shared by the sweep and the plot.
rf_weights = np.linspace(0, 1, 11)

# Validation MSE for every blend. The SVM always receives the complementary
# weight (1 - w), so the sweep starts at 100% SVM and ends at 100% forest.
mse = []
for w in rf_weights:
    ensemble_pred = y_vpred_svm * (1 - w) + y_vpred_rf * w
    mse.append(mean_squared_error(y_val, ensemble_pred))

# Plot validation error as a function of the random-forest weight.
plt.plot(rf_weights, mse)
plt.xlabel('Weight for random forest model')
plt.ylabel('MSE')
plt.show()
# Tabulate the validation MSE next to the random-forest weight that produced
# it. np.linspace guarantees exactly 11 evenly spaced weights in [0, 1].
ensemble_scores = pd.DataFrame({
    'RF Weight': np.linspace(0, 1, 11),
    'MSE': mse,
})
ensemble_scores

# idxmin() returns an index LABEL, so the row is looked up with label-based
# .loc; positional .iloc only gives the same answer while the frame keeps its
# default 0..n-1 index.
rf_weight = ensemble_scores.loc[ensemble_scores['MSE'].idxmin(), 'RF Weight']
# Predict the held-out test split with each individual model.
y_pred_svm = svm.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Blend the two predictions with the selected weight; the SVM receives the
# complementary weight (1 - rf_weight).
y_pred_ensemble = (1 - rf_weight) * y_pred_svm + rf_weight * y_pred_rf

# Report test-set MSE for both base models and for the ensemble.
for label, predictions in (
    ('SVM MSE on test set:', y_pred_svm),
    ('RF MSE on test set:', y_pred_rf),
    ('Ensemble MSE on test set:', y_pred_ensemble),
):
    print(label, mean_squared_error(y_test, predictions))