library(xgboost)
= as.matrix(iris[,1:4])
X = as.integer(iris[,5]) - 1 # classes must be integers starting from 0
Y
= xgb.DMatrix(X, label = Y)
xgdata
# nrounds = number of trees in the ensemble
= xgboost(data = xgdata,
brt objective="multi:softprob",
nrounds = 50,
num_class = 3,
verbose = 0)
5 Boosted gradient trees
Boosted gradient machines achieve currently state-of-the-art performance for structured (tabular) data which makes them probably one of the most important algorithms for E&E where structured data dominates the field.
In the following, we use the ‘xgboost’ package (Chen et al. (2022)) (Python: ‘xgboost’ (Chen et al. (2022)), Julia: ‘MLJ’ (Blaom et al. (2019)))
5.1 Classification
Show feature importances:
xgb.importance(model = brt)
Feature Gain Cover Frequency
1: Petal.Length 0.671879438 0.57441039 0.3792049
2: Petal.Width 0.311535837 0.29261084 0.3088685
3: Sepal.Width 0.010177107 0.04910115 0.1162080
4: Sepal.Length 0.006407618 0.08387763 0.1957187
Make predictions (class probabilities):
head(matrix(predict(brt, newdata = xgb.DMatrix(X)), ncol =3), n = 3)
[,1] [,2] [,3]
[1,] 0.995287061 0.002195822 0.001027058
[2,] 0.003323558 0.995396435 0.001592265
[3,] 0.001389398 0.002407764 0.997380674
import xgboost as xgb
from sklearn import datasets
from sklearn.preprocessing import scale
= datasets.load_iris()
iris = scale(iris.data)
X = iris.target
Y
# Parameters:
= {
param 'max_depth':2,
'eta':1,
'objective':'multi:softmax' }
= 50
num_round
= xgb.XGBClassifier(param, num_round, verbosity = 0).fit(X, Y) model
/home/max/miniconda3/envs/r-sjsdm/lib/python3.9/site-packages/xgboost/core.py:568: FutureWarning: Pass `objective, use_label_encoder` as keyword args. Passing these as positional arguments will be considered as error in future releases.
warnings.warn(
Feature importance
model.feature_importances_
array([0.00959796, 0.01645038, 0.6765859 , 0.29736578], dtype=float32)
Make predictions:
0:10,:] model.predict_proba(X)[
array([[9.9680281e-01, 2.3831066e-03, 8.1413286e-04],
[9.9636227e-01, 2.3820533e-03, 1.2557388e-03],
[9.9680281e-01, 2.3831066e-03, 8.1413286e-04],
[9.9679452e-01, 2.3830866e-03, 8.2237815e-04],
[9.9680281e-01, 2.3831066e-03, 8.1413286e-04],
[9.9680281e-01, 2.3831066e-03, 8.1413286e-04],
[9.9680281e-01, 2.3831066e-03, 8.1413286e-04],
[9.9680281e-01, 2.3831066e-03, 8.1413286e-04],
[9.9636227e-01, 2.3820533e-03, 1.2557388e-03],
[9.9679452e-01, 2.3830866e-03, 8.2237815e-04]], dtype=float32)
import StatsBase;
using MLJ;
= @load XGBoostClassifier pkg=XGBoost;
BRT_classifier using RDatasets;
using StatsBase;
using DataFrames;
= dataset("datasets", "iris");
iris = mapcols(StatsBase.zscore, iris[:, 1:4]);
X = iris[:, 5]; Y
Models:
= fit!(machine(BRT_classifier(), X, Y)) model
trained Machine; caches model-specific representations of data
model: XGBoostClassifier(num_round = 100, …)
args:
1: Source @086 ⏎ Table{AbstractVector{Continuous}}
2: Source @683 ⏎ AbstractVector{Multiclass{3}}
Predictions:
predict(model, X)[1:5] MLJ.
5-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, String, UInt8, Float32}:
UnivariateFinite{Multiclass{3}}(setosa=>0.997, versicolor=>0.00238, virginica=>0.000814)
UnivariateFinite{Multiclass{3}}(setosa=>0.996, versicolor=>0.00238, virginica=>0.00126)
UnivariateFinite{Multiclass{3}}(setosa=>0.997, versicolor=>0.00238, virginica=>0.000814)
UnivariateFinite{Multiclass{3}}(setosa=>0.997, versicolor=>0.00238, virginica=>0.000822)
UnivariateFinite{Multiclass{3}}(setosa=>0.997, versicolor=>0.00238, virginica=>0.000814)
5.2 Regression
library(xgboost)
= as.matrix(iris[,2:4])
X = iris[,1]
Y
= xgb.DMatrix(X, label = Y)
xgdata
# nrounds = number of trees in the ensemble
= xgboost(data = xgdata,
brt objective="reg:squarederror",
nrounds = 50,
verbose = 0)
Show feature importances:
xgb.importance(model = brt)
Feature Gain Cover Frequency
1: Petal.Length 0.86781219 0.4789538 0.3789062
2: Petal.Width 0.06987880 0.2128402 0.2626953
3: Sepal.Width 0.06230901 0.3082060 0.3583984
Make predictions:
head(predict(brt, newdata = xgb.DMatrix(X), n = 3))
[15:06:18] WARNING: amalgamation/../src/c_api/c_api.cc:785: `ntree_limit` is deprecated, use `iteration_range` instead.
[1] 3.506606 3.506606 3.506606 3.506606 3.506606 3.506606
import xgboost as xgb
from sklearn import datasets
from sklearn.preprocessing import scale
= datasets.load_iris()
iris = iris.data
data = scale(data[:,1:4])
X = data[:,0]
Y
# Parameters:
= xgb.XGBRegressor(
model = 'reg:squarederror',
objective = 2,
max_depth = 50,
n_estimators = 0).fit(X, Y) verbosity
Feature importance:
print(model.feature_importances_)
[0.08471056 0.835755 0.07953447]
Make predictions:
0:10] model.predict(X)[
array([5.0407157, 4.6844926, 4.711238 , 4.917956 , 5.0407157, 5.450946 ,
4.928966 , 4.986462 , 4.6750975, 4.917956 ], dtype=float32)
import StatsBase;
using MLJ;
= @load XGBoostRegressor pkg=XGBoost;
BRT_regressor using RDatasets;
using DataFrames;
= dataset("datasets", "iris");
iris = mapcols(StatsBase.zscore, iris[:, 2:4]);
X = iris[:, 1]; Y
Model:
= fit!(machine(BRT_regressor(), X, Y)) model
trained Machine; caches model-specific representations of data
model: XGBoostRegressor(num_round = 100, …)
args:
1: Source @893 ⏎ Table{AbstractVector{Continuous}}
2: Source @522 ⏎ AbstractVector{Continuous}
Predictions:
predict(model, X)[1:5] MLJ.
5-element Vector{Float32}:
5.1509466
4.8569074
4.551141
4.7587333
4.999504