4 Random forest
The random forest (RF) algorithm is probably one of the most famous ML algorithms, and not without reason: compared to other well-performing algorithms, RF has only a few hyper-parameters, and because of bagging and the random sampling of candidate variables for the node splits, its model complexity is regulated well internally.
In the following, we use the ‘ranger’ package (Wright and Ziegler 2017) (Python: ‘scikit-learn’ (Pedregosa et al. 2011), Julia: ‘MLJ’ (Blaom et al. 2019)).
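To illustrate the point about the small number of hyper-parameters, here is a minimal sketch of the knobs most users actually touch in ‘ranger’ (the concrete values are our assumptions, not recommendations; see ?ranger for the defaults):
library(ranger)
rf_sketch = ranger(Species ~ ., data = iris,
                   num.trees = 500,    # number of bagged trees
                   mtry = 2,           # variables sampled as split candidates per node
                   min.node.size = 5)  # minimal node size, limits tree depth
rf_sketch$prediction.error            # out-of-bag error estimate
mtry implements the random sampling of candidate variables mentioned above, and the out-of-bag error estimate comes for free from bagging.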
4.1 Classification
Prepare the data and fit a probability forest:
library(ranger)
X = iris[,1:4]
Y = iris[,5,drop=FALSE]
data = cbind(Y, X)
rf = ranger(Species~., data = data, probability = TRUE, importance = "impurity")
Show feature importances:
importance(rf)
Sepal.Length Sepal.Width Petal.Length Petal.Width
9.067816 1.358848 41.845718 43.340283
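The impurity importance shown above is fast to compute but is known to be biased towards variables with many possible split points; ‘ranger’ also offers permutation importance as an alternative (a sketch, reusing the data from above):
rf_perm = ranger(Species~., data = data, probability = TRUE,
                 importance = "permutation")
importance(rf_perm)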
Make predictions (class probabilities):
head(predict(rf, data = data)$predictions, n = 3)
setosa versicolor virginica
[1,] 1.0000000 0.0000000000 0.0000000000
[2,] 0.9995556 0.0002222222 0.0002222222
[3,] 1.0000000 0.0000000000 0.0000000000
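Because the probability forest returns class probabilities rather than labels, hard class predictions can be obtained by picking the most probable class per row (a minimal base-R sketch):
probs = predict(rf, data = data)$predictions
head(colnames(probs)[apply(probs, 1, which.max)], n = 3)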
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.preprocessing import scale
iris = datasets.load_iris()
X = scale(iris.data)
Y = iris.target

model = RandomForestClassifier().fit(X, Y)
Feature importance:
print(model.feature_importances_)
[0.1047826 0.02722545 0.43783582 0.43015613]
Make predictions:
model.predict_proba(X)[0:10,:]
array([[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.]])
import StatsBase;
using MLJ;
RF_classifier = @load RandomForestClassifier pkg=DecisionTree;
using RDatasets;
using StatsBase;
using DataFrames;
iris = dataset("datasets", "iris");
X = mapcols(StatsBase.zscore, iris[:, 1:4]);
Y = iris[:, 5];
Model:
model = fit!(machine(RF_classifier(), X, Y))
trained Machine; caches model-specific representations of data
model: RandomForestClassifier(max_depth = -1, …)
args:
1: Source @613 ⏎ Table{AbstractVector{Continuous}}
2: Source @784 ⏎ AbstractVector{Multiclass{3}}
Feature importance:
feature_importances(model)
4-element Vector{Pair{Symbol, Float64}}:
:PetalLength => 0.51551371534272
:PetalWidth => 0.3981451261378913
:SepalLength => 0.06998182233047624
:SepalWidth => 0.01635933618891258
Predictions:
MLJ.predict(model, X)[1:5]
5-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, String, UInt8, Float64}:
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
4.2 Regression
library(ranger)
X = iris[,2:4]
data = cbind(iris[,1,drop=FALSE], X)
rf = ranger(Sepal.Length~., data = data, importance = "impurity")
Show feature importances:
importance(rf)
Sepal.Width Petal.Length Petal.Width
11.72733 46.86181 37.13289
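Note that ‘ranger’ reports raw impurity decreases, whereas scikit-learn normalizes its feature_importances_ to sum to one; to compare the two directly, the ‘ranger’ values can be rescaled (a small sketch):
importance(rf) / sum(importance(rf))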
Make predictions:
head(predict(rf, data = data)$predictions, n = 3)
[1] 5.104768 4.774441 4.649346
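For a quick check of fit quality, the training RMSE can be computed by hand; since the training error of an RF is optimistic, the out-of-bag error stored in the fitted object is the more honest estimate (a sketch):
preds = predict(rf, data = data)$predictions
sqrt(mean((data$Sepal.Length - preds)^2))  # training RMSE (optimistic)
sqrt(rf$prediction.error)                  # out-of-bag RMSE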
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.preprocessing import scale
iris = datasets.load_iris()
data = iris.data
X = scale(data[:,1:4])
Y = data[:,0]

model = RandomForestRegressor().fit(X, Y)
Feature importance:
print(model.feature_importances_)
[0.07991512 0.85830207 0.06178281]
Make predictions:
model.predict(X)[0:10]
array([5.106 , 4.8205 , 4.57298571, 4.76945 , 5.017 ,
5.429 , 4.80283333, 5.06201667, 4.5855 , 4.856 ])
import StatsBase;
using MLJ;
RF_regressor = @load RandomForestRegressor pkg=DecisionTree;
using RDatasets;
using DataFrames;
iris = dataset("datasets", "iris");
X = mapcols(StatsBase.zscore, iris[:, 2:4]);
Y = iris[:, 1];
Model:
model = fit!(machine(RF_regressor(), X, Y))
trained Machine; caches model-specific representations of data
model: RandomForestRegressor(max_depth = -1, …)
args:
1: Source @316 ⏎ Table{AbstractVector{Continuous}}
2: Source @129 ⏎ AbstractVector{Continuous}
Feature importance:
feature_importances(model)
3-element Vector{Pair{Symbol, Float64}}:
:PetalLength => 0.6626304609310221
:PetalWidth => 0.23647943010293143
:SepalWidth => 0.10089010896604662
Predictions:
MLJ.predict(model, X)[1:5]
5-element Vector{Float64}:
5.1000000000000005
4.659999999999999
4.62
4.720000000000001
5.0600000000000005