6  Deep neural networks

Deep neural networks, or more precisely here fully connected neural networks, can be built very flexibly, which makes applying them more challenging than other ML algorithms.

In the following, we use the ‘keras’ package (Allaire and Chollet (2022); Chollet et al. (2015)) (Python: ‘keras’ (Chollet et al. (2015)); Julia: ‘Flux’ (Innes et al. (2018))), which is a higher-level API on top of the Python ‘tensorflow’ framework (Abadi et al. (2016)).

6.1 Classification

library(keras)
X = scale(as.matrix(iris[,1:4]))
Y = as.integer(iris$Species)
# We need to one hot encode our response classes
YT = k_one_hot(Y-1L, num_classes = 3)

DNN = keras_model_sequential() %>% 
  # first hidden layer
  layer_dense(input_shape = ncol(X), 
              units = 10, 
              activation = "relu") %>% 
  # second hidden layer with regularization
  layer_dense(units = 20, 
              activation = "relu",
              kernel_regularizer = regularizer_l1()) %>% 
  # output layer, 3 output neurons for our three classes
  # and softmax activation to get quasi probabilities 
  # that sum up to 1 for each observation
  layer_dense(units = 3, 
              activation = "softmax")

# print architecture
summary(DNN)
Model: "sequential"
________________________________________________________________________________
 Layer (type)                       Output Shape                    Param #     
================================================================================
 dense_2 (Dense)                    (None, 10)                      50          
 dense_1 (Dense)                    (None, 20)                      220         
 dense (Dense)                      (None, 3)                       63          
================================================================================
Total params: 333
Trainable params: 333
Non-trainable params: 0
________________________________________________________________________________
# add loss function and optimizer
DNN %>% 
  compile(loss = loss_categorical_crossentropy,
          optimizer = optimizer_adamax(0.01))

# train model
DNN %>% 
  fit(X, YT, epochs = 50, verbose = 0)

Make predictions (class probabilities):

head(predict(DNN, X), n = 3)
          [,1]        [,2]         [,3]
[1,] 0.9962465 0.003589567 0.0001640228
[2,] 0.9848880 0.014879720 0.0002324550
[3,] 0.9970835 0.002787639 0.0001286939
from tensorflow import keras
from tensorflow.keras.layers import *
from sklearn import datasets
from sklearn.preprocessing import scale
iris = datasets.load_iris()
X = scale(iris.data)
Y = iris.target

# We need to one hot encode our response classes
YT = keras.utils.to_categorical(Y, num_classes = 3)
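
To see what the one-hot encoding produces, a minimal sketch (not part of the workflow above): each class index is mapped to a row with a single 1.

# sketch only: one row per class index, one column per class
keras.utils.to_categorical([0, 1, 2], num_classes = 3)
# array([[1., 0., 0.],
#        [0., 1., 0.],
#        [0., 0., 1.]])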

DNN = keras.Sequential()
  # first hidden layer
DNN.add(Dense(
  input_shape=[X.shape[1]], 
  units = 10, 
  activation = "relu")) 
  # second hidden layer with regularization
DNN.add(Dense(
  units = 20, 
  activation = "relu",
  kernel_regularizer = keras.regularizers.l1()))
  # output layer, 3 output neurons for our three classes
  # and softmax activation to get quasi probabilities 
  # that sum up to 1 for each observation
DNN.add(Dense(
  units = 3, 
  activation = "softmax"))

# print architecture
DNN.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_3 (Dense)             (None, 10)                50        
                                                                 
 dense_4 (Dense)             (None, 20)                220       
                                                                 
 dense_5 (Dense)             (None, 3)                 63        
                                                                 
=================================================================
Total params: 333
Trainable params: 333
Non-trainable params: 0
_________________________________________________________________

# add loss function and optimizer
DNN.compile(loss = keras.losses.categorical_crossentropy,
            optimizer = keras.optimizers.Adamax(0.01))

# train model
DNN.fit(X, YT, epochs = 50, verbose = 0)
<keras.callbacks.History object at 0x7f09a7452460>
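
The fit call returns a History object (printed above because it was not assigned). A minimal sketch of how it could be captured to inspect the recorded training loss (note that calling fit again continues training the same model):

# sketch only: keep the History object returned by fit()
history = DNN.fit(X, YT, epochs = 50, verbose = 0)
history.history["loss"][-1]  # loss of the last recorded epoch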

Make predictions:

DNN.predict(X)[0:10,:]

1/5 [=====>........................] - ETA: 0s
5/5 [==============================] - 0s 940us/step
array([[9.94787872e-01, 4.17001359e-03, 1.04207790e-03],
       [9.82984900e-01, 1.54780671e-02, 1.53702230e-03],
       [9.96189654e-01, 3.01337033e-03, 7.96947046e-04],
       [9.94444370e-01, 4.62495722e-03, 9.30633105e-04],
       [9.97212470e-01, 2.00825068e-03, 7.79209717e-04],
       [9.94305611e-01, 3.97154968e-03, 1.72281812e-03],
       [9.98162746e-01, 1.16384670e-03, 6.73333008e-04],
       [9.93549705e-01, 5.33071114e-03, 1.11946405e-03],
       [9.94785070e-01, 4.37794533e-03, 8.37010215e-04],
       [9.86207247e-01, 1.25755025e-02, 1.21731847e-03]], dtype=float32)
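
Each row contains the quasi probabilities for the three classes. As a small sketch (not shown in the R and Julia parts), the predicted classes and the accuracy on the training data could be obtained via the argmax over the rows:

import numpy as np

# sketch only: class with the highest predicted probability per observation
pred = DNN.predict(X).argmax(axis = 1)
np.mean(pred == Y)  # accuracy on the training data
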
import StatsBase
using RDatasets
using StatsBase
using DataFrames
import MLJBase.int
using Flux, Statistics
using Flux.Data: DataLoader
using Flux: onehotbatch, onecold, @epochs
using Flux.Losses: logitcrossentropy

Data preparation:

iris = dataset("datasets", "iris");
X = transpose(Matrix(mapcols(StatsBase.zscore, iris[:, 1:4])));
Y = int(iris[:, 5], type = Int);
classes = sort(unique(Y));
YT = onehotbatch(Y, classes);
data_loader = DataLoader((X, YT), batchsize=10, shuffle=true);

Create model (similar to Keras):

model = Chain(
  Dense(4, 20, relu),
  Dense(20, 20, relu),
  Dense(20, 3)
)
Chain(
  Dense(4 => 20, relu),                 # 100 parameters
  Dense(20 => 20, relu),                # 420 parameters
  Dense(20 => 3),                       # 63 parameters
)                   # Total: 6 arrays, 583 parameters, 2.652 KiB.

Train/optimize Model:

parameters = Flux.params(model);
optimizer = ADAM(0.01);

# Help functions
loss(x, y) = logitcrossentropy(model(x), y);

get_loss() = @show sum(logitcrossentropy(model(X), YT));

## Training
for epoch in 1:20
  Flux.train!(loss, parameters, data_loader, optimizer, cb = Flux.throttle(get_loss, 5))
end
sum(logitcrossentropy(model(X), YT)) = 0.9968472881302792
sum(logitcrossentropy(model(X), YT)) = 0.407915665968644
sum(logitcrossentropy(model(X), YT)) = 0.25693295877600025
sum(logitcrossentropy(model(X), YT)) = 0.1766998314541965
sum(logitcrossentropy(model(X), YT)) = 0.11782307504167162
sum(logitcrossentropy(model(X), YT)) = 0.08904269600509418
sum(logitcrossentropy(model(X), YT)) = 0.07343147452012645
sum(logitcrossentropy(model(X), YT)) = 0.06219033144462982
sum(logitcrossentropy(model(X), YT)) = 0.07896333188631052
sum(logitcrossentropy(model(X), YT)) = 0.061009181350311414
sum(logitcrossentropy(model(X), YT)) = 0.05415327509114975
sum(logitcrossentropy(model(X), YT)) = 0.06574636561214808
sum(logitcrossentropy(model(X), YT)) = 0.04392088669909506
sum(logitcrossentropy(model(X), YT)) = 0.04204886024035571
sum(logitcrossentropy(model(X), YT)) = 0.050127564760964735
sum(logitcrossentropy(model(X), YT)) = 0.05248851799584753
sum(logitcrossentropy(model(X), YT)) = 0.03977422400493945
sum(logitcrossentropy(model(X), YT)) = 0.04544570242797193
sum(logitcrossentropy(model(X), YT)) = 0.060201900874104756
sum(logitcrossentropy(model(X), YT)) = 0.0396585422150025

Predictions:

transpose(softmax(model(X)))[1:5,:]
5×3 Matrix{Float64}:
 0.999989  1.09298e-5  1.52312e-11
 0.999916  8.3711e-5   1.14199e-9
 0.999995  5.31299e-6  7.13309e-11
 0.999993  7.32351e-6  2.90119e-10
 0.999997  2.72559e-6  4.39277e-12

6.2 Regression

library(keras)
X = scale(as.matrix(iris[,2:4]))
Y = as.matrix(iris[,1,drop=FALSE])

DNN = keras_model_sequential() %>% 
  # first hidden layer
  layer_dense(input_shape = ncol(X), 
              units = 10, 
              activation = "relu") %>% 
  # second hidden layer with regularization
  layer_dense(units = 20, 
              activation = "relu",
              kernel_regularizer = regularizer_l1()) %>% 
  # output layer, one output neuron for one response
  # and no activation function
  layer_dense(units = 1)

# print architecture
summary(DNN)
Model: "sequential_2"
________________________________________________________________________________
 Layer (type)                       Output Shape                    Param #     
================================================================================
 dense_8 (Dense)                    (None, 10)                      40          
 dense_7 (Dense)                    (None, 20)                      220         
 dense_6 (Dense)                    (None, 1)                       21          
================================================================================
Total params: 281
Trainable params: 281
Non-trainable params: 0
________________________________________________________________________________
# add loss function and optimizer
DNN %>% 
  compile(loss = loss_mean_squared_error,
          optimizer = optimizer_adamax(0.01))

# train model
DNN %>% 
  fit(X, Y, epochs = 50, verbose = 0)

Make predictions:

head(predict(DNN, X), n = 3)
          [,1]
[1,] 0.3252823
[2,] 0.3261368
[3,] 0.3257285
from tensorflow import keras
from tensorflow.keras.layers import *
from sklearn import datasets
from sklearn.preprocessing import scale
iris = datasets.load_iris()
data = iris.data
X = scale(data[:,1:4])
Y = data[:,0]

DNN = keras.Sequential()
  # first hidden layer
DNN.add(Dense(
  input_shape=[X.shape[1]], 
  units = 10, 
  activation = "relu")) 
  # second hidden layer with regularization
DNN.add(Dense(
  units = 20, 
  activation = "relu",
  kernel_regularizer = keras.regularizers.l1()))
  # output layer, one output neuron for one response
  # and no activation function (linear output)
DNN.add(Dense(
  units = 1, 
  activation = None))

# print architecture
DNN.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_9 (Dense)             (None, 10)                40        
                                                                 
 dense_10 (Dense)            (None, 20)                220       
                                                                 
 dense_11 (Dense)            (None, 1)                 21        
                                                                 
=================================================================
Total params: 281
Trainable params: 281
Non-trainable params: 0
_________________________________________________________________

# add loss function and optimizer
DNN.compile(loss = keras.losses.mean_squared_error,
            optimizer = keras.optimizers.Adamax(0.01))

# train model
DNN.fit(X, Y, epochs = 50, verbose = 0)
<keras.callbacks.History object at 0x7f0873635d30>

Make predictions:

DNN.predict(X)[0:10]

1/5 [=====>........................] - ETA: 0s
5/5 [==============================] - 0s 639us/step
array([[5.1530886],
       [4.635344 ],
       [4.844628 ],
       [4.718408 ],
       [5.2458243],
       [5.3749986],
       [5.0117435],
       [5.046633 ],
       [4.583274 ],
       [4.761703 ]], dtype=float32)
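
As a small sketch, the quality of the fit on the training data could be summarized with the root mean squared error:

import numpy as np

# sketch only: RMSE between predictions and the observed sepal lengths
pred = DNN.predict(X).reshape(-1)
np.sqrt(np.mean((pred - Y)**2))
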
import StatsBase
using RDatasets
using StatsBase
using DataFrames
import MLJBase.int
using Flux, Statistics
using Flux.Data: DataLoader
using Flux: onehotbatch, onecold, @epochs
using Flux.Losses: mse

Data preparation:

iris = dataset("datasets", "iris");
X = transpose(Matrix(mapcols(StatsBase.zscore, iris[:, 2:4])));
YT = iris[:, 1];
YT = reshape(YT, 1, length(YT));

data_loader = DataLoader((X, YT), batchsize=10, shuffle=true);

Create model (similar to Keras):

model = Chain(
  Dense(3, 20, relu),
  Dense(20, 20, relu),
  Dense(20, 1)
)
Chain(
  Dense(3 => 20, relu),                 # 80 parameters
  Dense(20 => 20, relu),                # 420 parameters
  Dense(20 => 1),                       # 21 parameters
)                   # Total: 6 arrays, 521 parameters, 2.410 KiB.

Train/optimize Model:

parameters = Flux.params(model);
optimizer = ADAM(0.01);

# Help functions
loss(x, y) = mse(model(x), y);

get_loss() = @show sum(mse(model(X), YT));

## Training
for epoch in 1:20
  Flux.train!(loss, parameters, data_loader, optimizer, cb = Flux.throttle(get_loss, 5))
end
sum(mse(model(X), YT)) = 35.254011263534736
sum(mse(model(X), YT)) = 12.46171269327014
sum(mse(model(X), YT)) = 2.2635190110050605
sum(mse(model(X), YT)) = 1.2401000963017133
sum(mse(model(X), YT)) = 0.6912489741242238
sum(mse(model(X), YT)) = 0.5464189172742467
sum(mse(model(X), YT)) = 0.39399982040159687
sum(mse(model(X), YT)) = 0.2799709648562455
sum(mse(model(X), YT)) = 0.22626626969525546
sum(mse(model(X), YT)) = 0.19594999439068672
sum(mse(model(X), YT)) = 0.15475237595663777
sum(mse(model(X), YT)) = 0.15207562095418492
sum(mse(model(X), YT)) = 0.1366212980594906
sum(mse(model(X), YT)) = 0.14285041851790095
sum(mse(model(X), YT)) = 0.12735811524649807
sum(mse(model(X), YT)) = 0.11084125588102431
sum(mse(model(X), YT)) = 0.10984711694555327
sum(mse(model(X), YT)) = 0.10183028533968545
sum(mse(model(X), YT)) = 0.10055789367635456
sum(mse(model(X), YT)) = 0.10006587279868089

Predictions:

transpose(model(X))[1:5]
5-element Vector{Float64}:
 5.078324422496932
 4.708044600690088
 4.817372548540471
 4.800092365508
 5.157734536850637