7  Convolutional neural networks

Convolutional neural networks (CNNs) are also deep neural networks, but they are based on convolutional layers, a biologically inspired type of layer optimized to process image-based data (LeCun, Bengio, and Hinton (2015)). CNNs consist of two stages: in the first, the images are passed through convolutional layers and the model learns to detect edges and shapes in the images. In the second stage, the spatial dimensions are flattened away and fully-connected layers are used to classify the previously identified shapes.
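
To illustrate what a convolutional layer computes, here is a minimal sketch in base R (the image values, the kernel, and the loop are purely illustrative and not part of ‘keras’): a small kernel is slid over the image, and at each position the weighted sum of the covered pixels is written into a feature map.

# toy 4x4 "image" and a 2x2 kernel (hypothetical edge-like filter)
image = matrix(1:16, nrow = 4)
kernel = matrix(c(1, 0, 0, -1), nrow = 2)
# sliding the kernel over the image yields a 3x3 feature map
feature_map = matrix(0, nrow = 3, ncol = 3)
for(i in 1:3) {
  for(j in 1:3) {
    feature_map[i, j] = sum(image[i:(i + 1), j:(j + 1)] * kernel)
  }
}
feature_map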

In the following, we will again use the ‘keras’ package (Python: ‘keras’ (Chollet et al. (2015)); Julia: ‘Flux’ (Innes et al. (2018))), but we will not differentiate between classification and regression because the only difference would be to change the last layer and the loss function (see section ‘Deep neural networks’).
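
As a minimal sketch of that point (illustrative only, it is not used in the MNIST example below): for a regression task we would end the network in a single linear output neuron and compile it with a mean squared error loss instead of the categorical cross-entropy used for classification.

CNN_regression = 
  keras_model_sequential() %>% 
  layer_conv_2d(input_shape = list(28, 28, 1),
                filters = 16,
                kernel_size = c(2, 2),
                activation = "relu") %>%
  layer_flatten() %>% 
  # one linear output neuron instead of a softmax over classes
  layer_dense(units = 1, activation = "linear")
CNN_regression %>% 
  compile(loss = loss_mean_squared_error,
          optimizer = optimizer_adamax(0.01))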

We will demonstrate the application of CNNs with the MNIST dataset, which consists of images of handwritten digits. The objective of the CNN is to classify these images, i.e. to predict which digit is shown. The MNIST dataset is one of the most famous benchmark datasets for image-based tasks (LeCun, Cortes, and Burges (2010)).

library(keras)
data = keras::dataset_mnist()
Loaded Tensorflow version 2.10.0
train = data$train
X = train$x/255
# we have to add a fourth dimension that informs
# the network about the number of channels
# of the images (here 1, i.e. grayscale)
X = array(X, dim = c(dim(X), 1))
YT = k_one_hot(train$y, num_classes = 10)
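
As a quick check, the array should now have the four-dimensional shape expected by layer_conv_2d, i.e. (number of images, height, width, channels):

dim(X) # 60000 28 28 1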



CNN = 
  keras_model_sequential() %>% 
  # first hidden layer
  layer_conv_2d(input_shape = list(28, 28, 1), 
                filters = 16,
                kernel_size = c(2, 2),
                activation = "relu") %>%
  layer_average_pooling_2d() %>% 
  layer_conv_2d(filters = 8,
                kernel_size = c(2, 2),
                activation = "relu") %>%
  # we use a normal DNN on top of the CNN:
  # the layer flatten will remove the additional 
  # dimensions
  layer_flatten() %>% 
  layer_dense(units = 20, 
              activation = "relu") %>%
  # 10 output neurons for 10 classes
  layer_dense(units = 10, 
              activation = "softmax")


# print architecture
summary(CNN)
Model: "sequential"
________________________________________________________________________________
 Layer (type)                       Output Shape                    Param #     
================================================================================
 conv2d_1 (Conv2D)                  (None, 27, 27, 16)              80          
 average_pooling2d (AveragePooling2  (None, 13, 13, 16)             0           
 D)                                                                             
 conv2d (Conv2D)                    (None, 12, 12, 8)               520         
 flatten (Flatten)                  (None, 1152)                    0           
 dense_1 (Dense)                    (None, 20)                      23060       
 dense (Dense)                      (None, 10)                      210         
================================================================================
Total params: 23,870
Trainable params: 23,870
Non-trainable params: 0
________________________________________________________________________________
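
The parameter counts in the summary can be reconstructed by hand: a convolutional layer has kernel height × kernel width × input channels × filters weights plus one bias per filter, and a dense layer has inputs × units weights plus one bias per unit.

# reproducing the parameter counts from the summary
2*2*1*16 + 16  # conv2d_1: 80
2*2*16*8 + 8   # conv2d: 520
1152*20 + 20   # dense_1: 23060
20*10 + 10     # dense: 210
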
# add loss function and optimizer
CNN %>% 
  compile(loss = loss_categorical_crossentropy,
          optimizer = optimizer_adamax(0.01))

CNN %>% 
  fit(X, YT, epochs = 3, batch_size = 125, verbose = 0)

Make predictions (class probabilities):

head(predict(CNN, X[1:100,,,,drop=FALSE]), n = 3)
             [,1]         [,2]         [,3]         [,4]         [,5]
[1,] 1.965090e-08 4.356115e-06 1.122467e-04 3.643005e-01 1.273639e-10
[2,] 9.999302e-01 2.228819e-09 6.124400e-05 3.962018e-08 2.638534e-07
[3,] 2.012695e-12 1.291264e-07 3.332906e-08 3.386993e-06 9.999701e-01
             [,6]         [,7]         [,8]         [,9]        [,10]
[1,] 6.353825e-01 6.928099e-09 1.841055e-05 1.732095e-04 8.759488e-06
[2,] 1.047530e-08 1.649719e-06 6.396242e-06 7.546749e-08 4.506550e-08
[3,] 2.200649e-11 4.529752e-10 7.846268e-06 1.812190e-06 1.660780e-05
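
To turn these class probabilities into hard class predictions, we can, for example, take the column with the highest probability per row (subtracting 1 because the classes are the digits 0 to 9); a minimal sketch:

preds = predict(CNN, X[1:100,,,,drop=FALSE])
# index of the most probable class per image, shifted to the digits 0-9
predicted_digits = apply(preds, 1, which.max) - 1
# fraction of correct predictions on these 100 training images
mean(predicted_digits == train$y[1:100])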

The same workflow in Python:

from tensorflow import keras
from tensorflow.keras.layers import *
data = keras.datasets.mnist.load_data()
train = data[0][0]
labels = data[0][1]

# scale the pixel values to [0, 1] and add the channel
# dimension expected by Conv2D (here 1, i.e. grayscale)
train = train.reshape(-1, 28, 28, 1) / 255.0

# We need to one hot encode our response classes
YT = keras.utils.to_categorical(labels, num_classes = 10)

CNN = keras.Sequential()
  # first hidden layer
CNN.add(Conv2D(input_shape = [28, 28, 1], 
                filters = 16,
                kernel_size = (2, 2),
                activation = "relu"))
CNN.add(AveragePooling2D())
CNN.add(Conv2D(filters = 8,
                kernel_size = (2, 2),
                activation = "relu"))
  # the flatten layer removes the additional dimensions
  # so that we can put a fully-connected layer on top of the CNN
CNN.add(Flatten())
  # output layer, 10 output neurons for our ten classes
  # and softmax activation to get quasi probabilities 
  # that sum up to 1 for each observation
CNN.add(Dense(
  units = 10, 
  activation = "softmax"))

# print architecture
CNN.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d_2 (Conv2D)           (None, 27, 27, 16)        80        
                                                                 
 average_pooling2d_1 (Averag  (None, 13, 13, 16)       0         
 ePooling2D)                                                     
                                                                 
 conv2d_3 (Conv2D)           (None, 12, 12, 8)         520       
                                                                 
 flatten_1 (Flatten)         (None, 1152)              0         
                                                                 
 dense_2 (Dense)             (None, 10)                11530     
                                                                 
=================================================================
Total params: 12,130
Trainable params: 12,130
Non-trainable params: 0
_________________________________________________________________
# add loss function and optimizer
CNN.compile(loss = keras.losses.categorical_crossentropy,
            optimizer = keras.optimizers.Adamax(0.01))

# train model
CNN.fit(train, YT, epochs = 5, verbose = 0)
<keras.callbacks.History object at 0x7f41f81cabe0>

Make predictions:

CNN.predict(train[0:10,:,:])

1/1 [==============================] - ETA: 0s
1/1 [==============================] - 0s 40ms/step
array([[2.73058399e-11, 5.12852705e-10, 6.08049167e-10, 2.43562553e-02,
        4.99432336e-11, 9.75430012e-01, 1.32788358e-09, 1.10361043e-09,
        2.33132232e-05, 1.90236795e-04],
       [9.99909341e-01, 4.28194667e-11, 2.47524281e-06, 3.22915097e-11,
        3.69686504e-10, 3.31575795e-10, 8.78980791e-05, 1.30885678e-12,
        1.75022961e-07, 2.92207769e-08],
       [1.86242036e-10, 3.31999718e-06, 3.40689681e-08, 2.78217680e-08,
        9.99971926e-01, 2.22715540e-10, 4.35771081e-12, 2.72133593e-06,
        8.05194304e-07, 2.11609149e-05],
       [1.04308064e-07, 9.99899924e-01, 2.12908731e-06, 9.57279056e-09,
        1.54249974e-05, 6.92280056e-09, 1.16389579e-07, 4.02882006e-06,
        7.77568057e-05, 4.73492236e-07],
       [6.86259725e-08, 3.67166734e-07, 1.25773326e-07, 6.08295522e-07,
        1.08382816e-03, 1.26706354e-07, 3.40244521e-12, 1.83549821e-02,
        4.74471017e-06, 9.80555177e-01],
       [2.71790706e-12, 7.43156381e-09, 9.99988377e-01, 1.16067767e-07,
        3.00915337e-09, 1.63208232e-11, 5.44445999e-12, 1.09075728e-07,
        1.12213984e-05, 1.28492715e-07],
       [7.45668160e-07, 9.98325646e-01, 8.54519240e-05, 8.99816951e-06,
        5.18015120e-04, 7.52458902e-07, 1.64569755e-07, 4.39679582e-07,
        1.05933694e-03, 5.20451124e-07],
       [6.69499386e-11, 8.18687201e-11, 3.74586136e-07, 9.99990761e-01,
        9.71549975e-14, 1.62581344e-07, 5.61215510e-17, 2.27084040e-09,
        3.95518327e-06, 4.80054996e-06],
       [8.22694020e-08, 9.99639750e-01, 1.82648796e-06, 4.38489496e-06,
        2.37529792e-04, 7.21763058e-07, 2.96431665e-07, 8.08423101e-06,
        7.77193782e-05, 2.94992296e-05],
       [4.26150208e-08, 9.40309963e-09, 5.55570452e-08, 2.99561909e-09,
        9.99997020e-01, 7.58271113e-10, 6.39962194e-10, 5.68171856e-08,
        2.87104945e-06, 1.34309447e-11]], dtype=float32)

And the same workflow in Julia with ‘Flux’:

using MLDatasets: MNIST
using Flux, Statistics
using Flux.Data: DataLoader
using Flux: onehotbatch, onecold, @epochs
using Flux.Losses: logitcrossentropy

Data preparation:

ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"
"true"

xtrain, ytrain = MNIST(:train)[:];
xtrain = reshape(xtrain/255., 28, 28, 1, 60000);
ytrain = onehotbatch(ytrain, 0:9);

data_loader = DataLoader((xtrain, ytrain), batchsize=100, shuffle=true);

Create model (similar to Keras):

model = Chain(
  Conv((2, 2), 1=>16, pad = (1, 1), relu),
  MeanPool((2, 2)),
  Conv((2, 2), 16=>8, pad = (1, 1), relu),
  MeanPool((2, 2)),
  Flux.flatten,
  Dense(392, 20, relu),
  Dense(20, 10)
)
Chain(
  Conv((2, 2), 1 => 16, relu, pad=1),   # 80 parameters
  MeanPool((2, 2)),
  Conv((2, 2), 16 => 8, relu, pad=1),   # 520 parameters
  MeanPool((2, 2)),
  Flux.flatten,
  Dense(392 => 20, relu),               # 7_860 parameters
  Dense(20 => 10),                      # 210 parameters
)                   # Total: 8 arrays, 8_670 parameters, 34.977 KiB.

Train/optimize the model:

parameters = Flux.params(model);
optimizer = ADAM(0.01);

# Help functions
loss(x, y) = logitcrossentropy(model(x), y);

get_loss() = @show sum(logitcrossentropy(model(xtrain[:,:,:,1:100]), ytrain[:,1:100]));

## Training
for epoch in 1:1
  Flux.train!(loss, parameters, data_loader, optimizer, cb = Flux.throttle(get_loss, 6000))
end
sum(logitcrossentropy(model(xtrain[:, :, :, 1:100]), ytrain[:, 1:100])) = 2.3016892844159544

Predictions (the last layer has no softmax activation, so we apply the softmax to the model output to get class probabilities):

softmax(model(xtrain[:,:,:,1:5]))[:,1]
10-element Vector{Float64}:
 0.0949596365926727
 0.11319646009727474
 0.09622221648932856
 0.1105004159927897
 0.09568476487396564
 0.08534299720248913
 0.10008107419393342
 0.10874953336990927
 0.10192039819733163
 0.09334250299030501