3 k-nearest-neighbor
The k-nearest-neighbor algorithm doesn’t really learn from the data. Predictions for new observations are made based on the class affiliations (or response values) of their nearest neighbors, e.g. by majority voting or averaging. The nearest neighbors are found by calculating the distance of the new observation to all observations in the training dataset.
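To make this mechanism concrete, here is a minimal from-scratch sketch of a single k-NN classification in R; the helper knn_vote is our own illustration, not part of any package:
# Minimal sketch: classify one new observation x by majority vote
# among its k nearest neighbors (Euclidean distance).
knn_vote = function(x, X_train, y_train, k = 5) {
  d = sqrt(colSums((t(X_train) - x)^2))        # distance of x to all training points
  neighbors = order(d)[1:k]                    # indices of the k nearest
  names(which.max(table(y_train[neighbors])))  # majority vote
}
knn_vote(scale(iris[, 1:4])[1, ], scale(iris[, 1:4]), iris$Species)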
In the following we use the ‘kknn’ package (Schliep and Hechenbichler (2016)) (Python: ‘scikit-learn’ (Pedregosa et al. (2011)), Julia: ‘MLJ’ (Blaom et al. (2019))). Unlike other ML packages, kknn lets us provide the test dataset already in the fit function.
3.1 Classification
library(kknn)

X = scale(iris[, 1:4])
Y = iris[, 5, drop = FALSE]
data = cbind(Y, X)

knn = kknn(Species ~ ., train = data, test = data)
Make predictions (class probabilities):
head(knn$prob, n = 3)
     setosa versicolor virginica
[1,]      1          0         0
[2,]      1          0         0
[3,]      1          0         0
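The probabilities are essentially the (possibly kernel-weighted) vote shares among the k neighbors. To get hard class labels, fitted() on the kknn object returns the majority-vote classes, which we can tabulate against the observed species:
# Hard class labels and a confusion matrix on the training data:
table(fitted(knn), iris$Species)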
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import scale
iris = datasets.load_iris()
X = scale(iris.data)
Y = iris.target

model = KNeighborsClassifier().fit(X, Y)

# Make predictions:
model.predict_proba(X)[0:10, :]
array([[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.]])
using MLJ;
using RDatasets;
using StatsBase;
using DataFrames;
kNN_classifier = @load KNNClassifier pkg=NearestNeighborModels;

iris = dataset("datasets", "iris");
X = mapcols(StatsBase.zscore, iris[:, 1:4]);
Y = iris[:, 5];
Model:
model = fit!(machine(kNN_classifier(), X, Y))
trained Machine; caches model-specific representations of data
model: KNNClassifier(K = 5, …)
args:
1: Source @749 ⏎ Table{AbstractVector{Continuous}}
2: Source @020 ⏎ AbstractVector{Multiclass{3}}
Predictions:
MLJ.predict(model, X)[1:5]
5-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, String, UInt8, Float64}:
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
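How the neighbors are found depends on the chosen distance. In kknn, for instance, the distance argument is the Minkowski exponent (2 = Euclidean, 1 = Manhattan); a small sketch, reusing the R data frame from above:
# Sketch: same model with Manhattan instead of Euclidean distance.
knn_l1 = kknn(Species ~ ., train = data, test = data, distance = 1)
head(knn_l1$prob, n = 3)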
3.2 Regression
library(kknn)

X = scale(iris[, 2:4])
data = cbind(iris[, 1, drop = FALSE], X)

knn = kknn(Sepal.Length ~ ., train = data, test = data)
Make predictions:
head(predict(knn), n = 3)
[1] 5.188492 4.739986 4.685332
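The choice of k controls how strongly the neighbor responses are smoothed. A minimal sketch using kknn’s k argument (larger k averages over more neighbors):
# Sketch: small vs. large k; compare the training error.
knn3  = kknn(Sepal.Length ~ ., train = data, test = data, k = 3)
knn25 = kknn(Sepal.Length ~ ., train = data, test = data, k = 25)
mean((predict(knn3) - data$Sepal.Length)^2)
mean((predict(knn25) - data$Sepal.Length)^2)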
from sklearn.neighbors import KNeighborsRegressor
from sklearn import datasets
from sklearn.preprocessing import scale
iris = datasets.load_iris()
data = iris.data
X = scale(data[:, 1:4])
Y = data[:, 0]

model = KNeighborsRegressor().fit(X, Y)

# Make predictions:
model.predict(X)[0:10]
array([5.18, 4.78, 4.68, 4.76, 4.98, 5.34, 5.06, 5.1 , 4.7 , 4.8 ])
using MLJ;
using RDatasets;
using DataFrames;
import StatsBase;
kNN_regressor = @load KNNRegressor pkg=NearestNeighborModels;

iris = dataset("datasets", "iris");
X = mapcols(StatsBase.zscore, iris[:, 2:4]);
Y = iris[:, 1];
Model:
model = fit!(machine(kNN_regressor(), X, Y))
trained Machine; caches model-specific representations of data
model: KNNRegressor(K = 5, …)
args:
1: Source @797 ⏎ Table{AbstractVector{Continuous}}
2: Source @974 ⏎ AbstractVector{Continuous}
Predictions:
MLJ.predict(model, X)[1:5]
5-element Vector{Float64}:
5.18
4.779999999999999
4.68
4.82
5.0200000000000005