from sklearn.metrics import roc_curve, RocCurveDisplay, auc # Module for AUC computation
import matplotlib.pyplot as plt
import numpy as np

# ROC curve and AUC of fit_RF_C on the testing sample.
# NOTE(review): the scores handed to roc_curve come from .predict(); if that
# returns hard class labels the curve has a single interior point and
# predict_proba(...)[:, 1] would be the usual input - TODO confirm against the
# definition of fit_RF_C.
y_true = testing_sample['R1M_Usd_C'].values
y_score = fit_RF_C.predict(testing_sample[features])
fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)  # area under the (fpr, tpr) curve
display = RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='example estimator',
)
display.plot()
plt.show()


print(f'AUC: {roc_auc}')

AUC: 0.5021678143170378


### recalling variable from chapter 5
# Row `alpha` of df_ridge_res is used as a coefficient vector: it is multiplied
# with X_penalized to rebuild the predictions, from which two summaries are kept.
ridge_bias, ridge_var = [], []
for alpha in range(len(alphas)):
    coefs = df_ridge_res.iloc[alpha, :].values
    predictions = np.dot(coefs, X_penalized.T)
    residuals = predictions - y_penalized
    ridge_bias.append(np.sum(np.square(residuals)))  # sum of squared prediction errors
    ridge_var.append(np.var(predictions))            # variance of the predictions
df = pd.DataFrame({'ridge_bias^2': ridge_bias, 'ridge_var': ridge_var})
df['total'] = df['ridge_bias^2'] + df['ridge_var']
# NOTE(review): the x axis is the row position (0 .. len(alphas)-1), not the
# penalisation value itself, although it is labelled 'Lambda'.
df.plot(subplots=True, title='Error Component', xlabel='Lambda')

array([<AxesSubplot:xlabel='Lambda'>, <AxesSubplot:xlabel='Lambda'>,
       <AxesSubplot:xlabel='Lambda'>], dtype=object)


from sklearn import tree # Tree module
import matplotlib.pyplot as plt # to adjust tree plot

# Predictors and label used to grow the illustrative tree
X = training_sample[features]
y = y_train

# Shallow regression tree: at most two levels, with a very small
# cost-complexity pruning penalty (ccp_alpha)
fit_tree_simple = tree.DecisionTreeRegressor(max_depth=2, ccp_alpha=1e-6)
fit_tree_simple.fit(X, y)

# Draw the fitted tree on an enlarged canvas, labelling splits with the column names
fig, ax = plt.subplots(figsize=(13, 8))
tree.plot_tree(fit_tree_simple, feature_names=X.columns.values, ax=ax)
plt.show()


# Average signed gap between the simple tree's predictions on X_test and y_test
tree_test_error = fit_tree_simple.predict(X_test) - y_test
bias_tree = np.mean(tree_test_error)
print(f'bias: {bias_tree}')

bias: 0.004973916538330352


# Variance of the simple tree's predictions on X_test
tree_test_pred = fit_tree_simple.predict(X_test)
var_tree = np.var(tree_test_pred)
print(f'var: {var_tree}')

var: 0.0001397982854475224


# Average signed gap between the boosted model's predictions on test_matrix_xgb and y_test
xgb_test_error = fit_xgb.predict(test_matrix_xgb) - y_test
bias_xgb = np.mean(xgb_test_error)
print(f'bias: {bias_xgb}')

bias: 0.019378203027941212


# Variance of the boosted model's predictions on test_matrix_xgb
xgb_test_pred = fit_xgb.predict(test_matrix_xgb)
var_xgb = np.var(xgb_test_pred)
print(f'var: {var_xgb}')

var: 0.0011795820901170373


import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error

# Scorer object wrapping mean_absolute_error with make_scorer's default settings.
# NOTE(review): the searches visible in this section do not use it (they pass
# scoring='neg_mean_squared_error'). If it is meant for model selection, an
# error metric normally needs greater_is_better=False - TODO confirm intended use.
scorer = make_scorer(mean_absolute_error)
# Candidate hyper-parameter values for XGBoost (5 x 3 x 5 = 75 combinations)
params = dict(
    learning_rate=[0.1, 0.3, 0.5, 0.7, 0.9],  # values for eta
    n_estimators=[10, 50, 100],               # values for nrounds
    reg_lambda=[0.01, 0.1, 1, 10, 100],       # values for lambda
)


print(params)

{'learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9], 'n_estimators': [10, 50, 100], 'reg_lambda': [0.01, 0.1, 1, 10, 100]}


# Depth-3 boosted regressor, tuned by a grid search over `params`
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    max_depth=3,
    n_jobs=-1,
)
model_gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_squared_error',  # negated MSE, so that a higher score is better
    cv=2,                              # 2-fold cross-validation
)
model_gs.fit(X_train, y_train)

# Tabulate the per-candidate cross-validation results for later plotting
cv_results = pd.DataFrame(model_gs.cv_results_)
print(f'Best Parameters using grid search: {model_gs.best_params_}')

Best Parameters using grid search: {'learning_rate': 0.1, 'n_estimators': 50, 'reg_lambda': 100}


# Keep only the tuned hyper-parameters and the mean cross-validated score of each candidate
res_df = pd.DataFrame(
    cv_results,
    columns=["param_n_estimators", "param_learning_rate", "param_reg_lambda", "mean_test_score"],
)
# The grid search is scored with 'neg_mean_squared_error', i.e. the MSE negated so
# that scikit-learn can maximise it; flipping the sign gives back positive errors.
res_df['mean_test_score'] = -res_df['mean_test_score'].values

# One panel per (n_estimators, reg_lambda) pair, laid out on a 3 x 5 grid
fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(16, 9))
ax_all = plt.gca()  # current axes; not referenced again in this cell

cnt = 0
for param, tmp in res_df.groupby(["param_n_estimators", "param_reg_lambda"]):
    row, col = divmod(cnt, 5)  # position of this panel in the grid
    ax = axes[row][col]
    rounded = np.round(tmp[["param_learning_rate", "mean_test_score"]], 2)
    rounded.plot.bar(x="param_learning_rate", y="mean_test_score",
                     ax=ax, alpha=0.5, legend=None)
    ax.set_xlabel("")    # no xlabel
    ax.set_ylim(0, 0.1)  # same y range on every panel
    # x tick labels only on the bottom row, y tick labels only on the first column
    if row < 2:
        ax.xaxis.set_ticklabels("")
    else:
        for label in ax.get_xticklabels():
            label.set_rotation(0)
    if col > 0:
        ax.yaxis.set_ticklabels("")
    ax.set_title(f"num_trees={param[0]},\n reg_lambda={param[1]}", fontsize=10)
    cnt += 1


from skopt import BayesSearchCV  # module for Bayesian optimisation on the scikit learn backend
# Reuse the hyper-parameter lists from the grid-search section as the search space
search_spaces = params
# Bayesian optimiser around the same estimator; 2-fold cross-validation keeps
# the computing time low
opt = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    scoring='neg_mean_squared_error',
    cv=2,
)
opt.fit(X_train, y_train)
cv_results_opt = pd.DataFrame(opt.cv_results_)


print(f'Best Parameters using bayes opt: {opt.best_params_}')

Best Parameters using bayes opt: OrderedDict([('learning_rate', 0.1), ('n_estimators', 50), ('reg_lambda', 100.0)])


# Keep only the tuned hyper-parameters and the mean cross-validated score of each candidate
res_df = pd.DataFrame(
    cv_results_opt,
    columns=["param_n_estimators", "param_learning_rate", "param_reg_lambda", "mean_test_score"],
)
# The Bayesian search is scored with 'neg_mean_squared_error', i.e. the MSE negated
# so that scikit-learn can maximise it; flipping the sign gives back positive errors.
res_df['mean_test_score'] = -res_df['mean_test_score'].values

# One panel per (n_estimators, reg_lambda) pair, laid out on a 3 x 5 grid
fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(16, 9))
ax_all = plt.gca()  # current axes; not referenced again in this cell

cnt = 0
for param, tmp in res_df.groupby(["param_n_estimators", "param_reg_lambda"]):
    row, col = divmod(cnt, 5)  # position of this panel in the grid
    ax = axes[row][col]
    rounded = np.round(tmp[["param_learning_rate", "mean_test_score"]], 2)
    rounded.plot.bar(x="param_learning_rate", y="mean_test_score",
                     ax=ax, alpha=0.5, legend=None)
    ax.set_xlabel("")    # no xlabel
    ax.set_ylim(0, 0.1)  # same y range on every panel
    # x tick labels only on the bottom row, y tick labels only on the first column
    if row < 2:
        ax.xaxis.set_ticklabels("")
    else:
        for label in ax.get_xticklabels():
            label.set_rotation(0)
    if col > 0:
        ax.yaxis.set_ticklabels("")
    ax.set_title(f"num_trees={param[0]},\n reg_lambda={param[1]}", fontsize=10)
    cnt += 1

Chapter 10 Validating and tuning

10.1 Learning metrics

10.1.1 Regression analysis

10.1.2 Classification analysis

10.2 Validation

10.2.1 The variance-bias tradeoff: theory

10.2.2 The variance-bias tradeoff: illustration

10.2.3 The risk of overfitting: principle

10.2.4 The risk of overfitting: some solutions

10.3 The search for good hyperparameters

10.3.1 Methods

10.3.2 Example: grid search

10.3.3 Example: Bayesian optimization

10.4 Short discussion on validation backtests

References