from sklearn import tree # Tree module
import matplotlib.pyplot as plt # to adjust tree plot

X = data_ml.iloc[:,3:96] # recall features/predictors, full sample
y = data_ml['R1M_Usd'] # recall label/Dependent variable, full sample

fit_tree = tree.DecisionTreeRegressor( # Definining the model
  min_samples_split = 8000, # Min nb of obs required to continue splitting 
  max_depth = 3, # Maximum depth (i.e. tree levels)
  ccp_alpha=0.000001, # complexity parameters
  min_samples_leaf =3500 # Min nb of obs required in each terminal node (leaf)
        )
fit_tree.fit(X, y) # Fitting the model
fig, ax = plt.subplots(figsize=(13, 8)) # resizing
tree.plot_tree(fit_tree,feature_names=X.columns.values, ax=ax) # Plot the tree
plt.show()


y_pred=fit_tree.predict(X.iloc[0:6,:])   # Test (prediction) on the first six instances of the sample


print(f'y_pred: {y_pred}')

y_pred: [0.01088066 0.01088066 0.01088066 0.01088066 0.01088066 0.01088066]


import seaborn as sns
unpivoted_data_ml = pd.melt(data_ml[['R1M_Usd','Mkt_Cap_12M_Usd','Pb','Advt_3M_Usd']], id_vars='R1M_Usd') # selecting and putting in vector
sns.lineplot(data = unpivoted_data_ml, y='R1M_Usd', x='value', hue='variable'); # Plot from seaborn


y_train = training_sample['R1M_Usd'].values # recall features/predictors, full sample
X_train = training_sample[features].values # recall label/Dependent variable, full sample

fit_tree2 = tree.DecisionTreeRegressor( # Definining the model
  min_samples_split = 4000, # Min nb of obs required to continue splitting
  max_depth = 5, # Maximum depth (i.e. tree levels)
  ccp_alpha=0.0001, # complexity parameters
  min_samples_leaf =1500 # Min nb of obs required in each terminal node (leaf)
        )
fit_tree2 = fit_tree2.fit(X_train, y_train) # Fitting the model

mse = np.mean((fit_tree2.predict(X_test) - y_test)**2)
print(f'MSE: {mse}')

MSE: 0.03699695809185004


hitratio = np.mean(fit_tree2.predict(X_test) * y_test > 0)
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.5460346399270738


from sklearn.ensemble import RandomForestRegressor

fit_RF = RandomForestRegressor(n_estimators = 40, # Nb of random trees
criterion ='mse', # function to measure the quality of a split
min_samples_split= 250, # Minimum size of terminal cluster
bootstrap=False, # replacement
max_features=30, # Nb of predictive variables for each tree
max_samples=10000 # Size of (random) sample for each tree
)
fit_RF.fit(X_train, y_train) # Fitting the model
fit_RF.predict(pd.DataFrame(X_test).iloc[0:5,]) # Prediction over the first 5 test instances

array([ 0.00139083,  0.02137373,  0.04259802, -0.01310026,  0.00028897])


from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_test, fit_RF.predict(X_test))
print(f'MSE: {mse}')

MSE: 0.03686227217696956


hitratio = np.mean(fit_RF.predict(X_test) * y_test > 0)
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.5320476298997265


from sklearn.ensemble import RandomForestClassifier

fit_RF_C = RandomForestClassifier( 
n_estimators = 40, # Nb of random trees
criterion ='gini', # function to measure the quality of a split
min_samples_split= 250, # Minimum size of terminal cluster
bootstrap=False, # replacement
max_features=30, # Nb of predictive variables for each tree
max_samples=20000 # Size of (random) sample for each tree
)
fit_RF_C=fit_RF_C.fit(X_train, y_c_train) # Fitting the model


hitratio = np.mean(fit_RF_C.predict(X_test) == y_c_test)
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.5030480856882407


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

fit_adaboost_C = AdaBoostClassifier(DecisionTreeClassifier(
        max_depth=3), # depth of the tree
        n_estimators=3) # Number of trees 
fit_adaboost_C.fit(X_train, y_c_train) # Fitting the model

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                   n_estimators=3)


from sklearn.metrics import accuracy_score # introducing buit-in function for accuracy
hitratio=accuracy_score(y_c_test, fit_adaboost_C.predict(X_test)) # Hitratio
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.49641066545123064


import xgboost as xgb # The package for boosted trees

data_ml['R1M_Usd_quantile'] = data_ml.groupby('date')['R1M_Usd'].transform(         # creating quantile... 
        lambda x: pd.qcut(x, 100, labels=False, duplicates=('drop'), precision=50)) # ...for selecting extreme values

boolean_quantile=(data_ml.loc[separation_mask]['R1M_Usd_quantile'].        # boolean array for selecting rows
                  values<=0.2) | (data_ml.loc[separation_mask]['R1M_Usd_quantile'].values>=0.8) # selecting extreme values

###############################################################################
train_features_xgb=training_sample.loc[boolean_quantile,features_short] # Independent variables
train_label_xgb=training_sample.loc[boolean_quantile,'R1M_Usd'] # Dependent variable
train_matrix_xgb=xgb.DMatrix(train_features_xgb, label=train_label_xgb) # XGB format!


mono_const="(0, 0, -1, 1, 0, -1, 0)" # Initialize the vector -- "-1" == decreasing, "+1" increasing 
# Decreasing in market cap -- mono_const[2]
# Increasing in past return -- mono_const[3]
# Decreasing in price-to-book -- mono_const[5]


params={'eta' : 0.3,                          # Learning rate
  'objective' : "reg:squarederror",     # Objective function
  'max_depth' : 4,                      # Maximum depth of trees
  'subsample' : 0.6,                    # Train on random 60% of sample
  'colsample_bytree' : 0.7,             # Train on random 70% of predictors
  'lambda' : 1,                         # Penalisation of leaf values
  'gamma' : 0.1,                        # Penalisation of number of leaves
  'nrounds' : 30,                       # Number of trees used (rather low here)
  'monotone_constraints' : mono_const,  # Monotonicity constraints
  'rate_drop' : 0.1,                    # Drop rate for DART
  'verbose' : 0}                      # No comment from the algo 

fit_xgb =xgb.train(params, train_matrix_xgb)

[10:27:17] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627: 
Parameters: { "nrounds", "rate_drop", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


test_features_xgb=testing_sample[features_short] # Test sample => XGB format
test_matrix_xgb=xgb.DMatrix(test_features_xgb, label=y_test) # XGB format!

fit_xgb.predict(test_matrix_xgb)
mse = np.mean((fit_xgb.predict(test_matrix_xgb) - y_test)**2)
print(f'MSE: {mse}')

MSE: 0.03781719994386558


hitratio = np.mean(fit_xgb.predict(test_matrix_xgb) * y_test > 0)
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.5460346399270738


train_label_xgb_C=training_sample.loc[boolean_quantile,'R1M_Usd_C'] # Dependent variable
train_matrix_xgb_C=xgb.DMatrix(train_features_xgb, label=train_label_xgb_C) # XGB format!


params_C={'eta' : 0.8,                  # Learning rate
  'objective' : "multi:softmax",        # Objective function
  'max_depth' : 4,                      # Maximum depth of trees
  'num_class' : 2,                      # number of classes
  'nrounds' : 10,                       # Number of trees used (rather low here)
  'verbose' : 0}                        # No comment from the algo 

fit_xgb_C =xgb.train(params_C, train_matrix_xgb_C)

[10:29:12] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627: 
Parameters: { "nrounds", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


hitratio = np.mean(fit_xgb_C.predict(test_matrix_xgb) == y_c_test)
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.49846171376481313


inst_weights = np.random.uniform(0,1,(train_features_xgb.shape[0],1)) # Random weights   
train_matrix_xgb=xgb.DMatrix(train_features_xgb, label=train_label_xgb, # XGB format!
                                weight = inst_weights)        # Weights!

	Bin. classif. (orig. Adaboost)	Regression (Drucker (1997))
Individual error	$\epsilon_i=\textbf{1}_{\left\{y_i\neq l_m(\textbf{x}_i) \right\}}$	$\epsilon_i=\frac{	y_i- l_m(\textbf{x}_i)	}{\underset{i}{\max}	y_i- l_m(\textbf{x}_i)	}$
Weight of learner via $f_a$	$f_a=\log\left(\frac{1-\epsilon}{\epsilon} \right)$,with $\epsilon=I^{-1}\sum_{i=1}^Iw_i \epsilon_i$	$f_a=\log\left(\frac{1-\epsilon}{\epsilon} \right)$,with $\epsilon=I^{-1}\sum_{i=1}^Iw_i \epsilon_i$
Weight of instances via $f_w(i)$	$f_w=f_a\epsilon_i$	$f_w=f_a\epsilon_i$
Output function via $f_y$	$f_y(x) = \text{sign}(x)$	weighted median of predictions

Chapter 6 Tree-based methods¶

6.1 Simple trees¶

6.1.1 Principle¶

6.1.2 Further details on classification¶

6.1.3 Pruning criteria¶

6.1.4 Code and interpretation¶

6.2 Random forests¶

6.2.1 Principle¶

6.2.2 Code and results¶

6.3 Boosted trees: Adaboost¶

6.3.1 Methodology¶

6.3.2 Illustration¶

6.4 Boosted trees: extreme gradient boosting¶

6.4.1 Managing Loss¶

6.4.2 Penalisation¶

6.4.3 Aggregation¶

6.4.4 Tree structure¶

6.4.5 Extensions¶

6.4.6 Code and results¶

6.4.7 Instance weighting¶

6.5 Discussion¶

6.6 Coding exercices¶

References¶