Image Recognition in R

image recognition deep learning transfer learning keras r

In this post we will explore image classification in Keras across several datasets, and show how transfer learning can be readily applied to improve well-known models.

Blake Conrad https://github.com/conradbm
2022-07-13

Recognize Handwritten Digits

Computer vision, as a sub-field of deep learning, has exploded over the last decade. The advent of better computers, readily available data sources, and remarkably capable models requiring very little code has made the unthinkable doable, and quickly.

Libraries

First we will grab the MNIST dataset. This consists of an array of 28x28 images with 10 classification labels.

Data Retrieval

mnist %>% names
[1] "train" "test" 

We can save the shapes and number of classes for later.

Data Shapes

# Get the image width and height from the training tensor
# (dim 1 = sample count, dims 2 and 3 = pixel rows/columns)
WIDTH <- dim(mnist$train$x)[[2]]
HEIGHT <- dim(mnist$train$x)[[3]]


# Get the number of unique target classes (10 digits for MNIST)
CLASSES <- length(unique(mnist$train$y))

mnist$train$x %>% dim
[1] 60000    28    28
mnist$train$y %>% dim
[1] 60000
mnist$test$x %>% dim
[1] 10000    28    28
mnist$test$y %>% dim
[1] 10000

Visualize Images

Next we can visualize a few images using the plot function in R. This was a little weird at first, because the images sometimes need to be standardized for RGB values depending on the function and the data shape.

library(raster)

#' Plot a few randomly sampled images with their labels.
#'
#' Renders `a_few` randomly chosen images from `x` and prints the
#' corresponding label from `y` for each one.
#'
#' @param x Image array whose first dimension indexes samples.
#' @param y Label vector aligned with the first dimension of `x`.
#' @param a_few Number of random images to render.
#' @param rgb_dim TRUE when `x` carries a trailing channel dimension.
#' @param max_value Maximum pixel value forwarded to as.raster();
#'   pass 255 for un-normalized 0-255 images. Default 1 preserves the
#'   previous behavior (normalized [0, 1] pixels).
plot_a_few <- function(x, y, a_few = 3, rgb_dim = FALSE, max_value = 1) {
  # seq_len() is safe even if the first dimension were 0
  rand_image_index <- sample(seq_len(dim(x)[[1]]), size = a_few)
  par(mar = c(0, 0, 0, 0))
  for (i in rand_image_index) {
    # Keep or drop the channel dimension depending on the input shape
    if (rgb_dim) {
      img <- x[i, , , ]
    } else {
      img <- x[i, , ]
    }

    plot(as.raster(img, max = max_value))
    # Show the ground-truth label alongside the rendered image
    label <- y[i]
    print(label)
  }
}

plot_a_few(mnist$train$x, mnist$train$y, a_few=3)

[1] 3

[1] 2

[1] 5

Model

Simple Dense Model

The simplest model will take the image tensor and flatten it into the standard feed forward format. The prediction is over our CLASSES which is 10.

# Simple model
# A minimal dense classifier: flatten the WIDTH x HEIGHT image into a
# feature vector, one hidden ReLU layer, dropout for regularization,
# then a softmax output over the CLASSES digit labels.
model <- keras::keras_model_sequential() %>% 
            # 28x28 image -> 784-vector (no trainable parameters)
            keras::layer_flatten(input_shape = c(WIDTH, HEIGHT), 
                                 name = "mnist_flatten_input") %>% 
            # Fully connected hidden layer
            keras::layer_dense(units = 128, activation = "relu", 
                               name = "mnist_dense") %>% 
            # Randomly zero 20% of activations during training
            keras::layer_dropout(0.2, name = "mnist_dropout") %>% 
            # Class-probability output (one unit per class)
            keras::layer_dense(CLASSES, activation = "softmax", 
                               name = "mnist_dense_output")
model
Model: "sequential"
______________________________________________________________________
Layer (type)                   Output Shape                Param #    
======================================================================
mnist_flatten_input (Flatten)  (None, 784)                 0          
______________________________________________________________________
mnist_dense (Dense)            (None, 128)                 100480     
______________________________________________________________________
mnist_dropout (Dropout)        (None, 128)                 0          
______________________________________________________________________
mnist_dense_output (Dense)     (None, 10)                  1290       
======================================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
______________________________________________________________________

Summary

# Some summary statistics
base::summary(model)
Model: "sequential"
______________________________________________________________________
Layer (type)                   Output Shape                Param #    
======================================================================
mnist_flatten_input (Flatten)  (None, 784)                 0          
______________________________________________________________________
mnist_dense (Dense)            (None, 128)                 100480     
______________________________________________________________________
mnist_dropout (Dropout)        (None, 128)                 0          
______________________________________________________________________
mnist_dense_output (Dense)     (None, 10)                  1290       
======================================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
______________________________________________________________________

Compile

Reminder that sparse_categorical_crossentropy is for integer y values rather than a one-hot matrix — it handles that encoding for you. Otherwise, you need to use the to_categorical function to transform the y vector into a matrix.

# Compile the model
# The "sparse" crossentropy variant accepts integer class labels
# directly, so y does not need to be one-hot encoded first.
model %>% 
  keras::compile(
    loss = "sparse_categorical_crossentropy",
    optimizer = "adam",
    metrics = "accuracy"
  )

Fit

# Fit the model for 5 epochs, holding out 30% of the training data for
# validation; verbose = 2 prints one summary line per epoch.
history <- model %>% 
             keras::fit(
               x = mnist$train$x, y = mnist$train$y,
               epochs = 5,
               validation_split = 0.3,
               verbose = 2
             )

Visualize

#' Plot training/validation loss and accuracy curves from a keras history.
#'
#' Note: plot(history) produces similar output with no custom code.
#'
#' @param history A keras training history, as returned by keras::fit().
#' @return A list of ggplot objects: loss_plot and acc_plot.
plot_history_metrics <- function(history) {
  # One row per epoch, one column per recorded metric
  metrics_df <- data.frame(history$metrics) %>%
    dplyr::mutate(epoch = dplyr::row_number())

  loss_plot <- ggplot(data = metrics_df) +
    geom_line(aes(x = epoch, y = loss, color = "training loss")) +
    geom_line(aes(x = epoch, y = val_loss, color = "validation loss")) +
    theme_bw() +
    labs(color = "Legend") +
    ggtitle("Model Loss")

  acc_plot <- ggplot(data = metrics_df) +
    geom_line(aes(x = epoch, y = accuracy, color = "training accuracy")) +
    geom_line(aes(x = epoch, y = val_accuracy, color = "validation accuracy")) +
    theme_bw() +
    labs(color = "Legend") +
    ggtitle("Model Accuracy")

  list(loss_plot = loss_plot, acc_plot = acc_plot)
}

plot_history_metrics(history)
$loss_plot


$acc_plot

Another way

Another, equally valid, way — as opposed to flattening the input inside the model — is to do it explicitly on the outside. This can be done using the array_reshape function. We can also make our y values into a categorical matrix using the to_categorical function. This changes our loss from sparse_categorical_crossentropy to categorical_crossentropy. A tricky distinction: one does not expect a matrix, the other does.

# Flatten each image to a WIDTH*HEIGHT feature vector outside the model
x_train <- keras::array_reshape(mnist$train$x, c(nrow(mnist$train$x), WIDTH*HEIGHT))
x_test <- keras::array_reshape(mnist$test$x, c(nrow(mnist$test$x), WIDTH*HEIGHT))
# One-hot encode the labels; use CLASSES (defined earlier) rather than a
# hard-coded 10 so this stays consistent with the model's output layer
y_train <- keras::to_categorical(mnist$train$y, CLASSES)
y_test <- keras::to_categorical(mnist$test$y, CLASSES)

x_test %>% dim
[1] 10000   784
y_test %>% head
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    0    0    0    0    0    0    0    1    0     0
[2,]    0    0    1    0    0    0    0    0    0     0
[3,]    0    1    0    0    0    0    0    0    0     0
[4,]    1    0    0    0    0    0    0    0    0     0
[5,]    0    0    0    0    1    0    0    0    0     0
[6,]    0    1    0    0    0    0    0    0    0     0
# Model pre-flattened for shape and made categorically long in y
# The input is already a 784-vector, so no flatten layer is needed;
# otherwise identical to the first model (dense -> dropout -> softmax).
model <- keras::keras_model_sequential() %>%  
            keras::layer_dense(input_shape = c(WIDTH*HEIGHT), 
                               units = 128, 
                               activation = "relu", 
                               name = "mnist_dense") %>% 
            keras::layer_dropout(0.2, 
                                 name = "mnist_dropout") %>% 
            keras::layer_dense(CLASSES, 
                               activation = "softmax", 
                               name = "mnist_dense_output")
model
Model: "sequential_1"
______________________________________________________________________
Layer (type)                   Output Shape                Param #    
======================================================================
mnist_dense (Dense)            (None, 128)                 100480     
______________________________________________________________________
mnist_dropout (Dropout)        (None, 128)                 0          
______________________________________________________________________
mnist_dense_output (Dense)     (None, 10)                  1290       
======================================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
______________________________________________________________________

Summary

# Model architectures
base::summary(model)
Model: "sequential_1"
______________________________________________________________________
Layer (type)                   Output Shape                Param #    
======================================================================
mnist_dense (Dense)            (None, 128)                 100480     
______________________________________________________________________
mnist_dropout (Dropout)        (None, 128)                 0          
______________________________________________________________________
mnist_dense_output (Dense)     (None, 10)                  1290       
======================================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
______________________________________________________________________

Compile

# Compile the model
# y is now a one-hot matrix, so the non-sparse loss is required here.
model %>% 
  keras::compile(
    # loss = "sparse_categorical_crossentropy",
    loss = "categorical_crossentropy",
    optimizer = "adam",
    metrics = "accuracy"
  )

Fit

Once we configure our model, we can compile it, fit, then plot to see the performance. Turns out, you can just do plot(history) and the function to plot these metrics is entirely superfluous.

# Train on the pre-flattened inputs and one-hot labels,
# again holding out 30% of the training data for validation
history <- model %>% keras::fit(
  x = x_train, y = y_train,
  validation_split = 0.3,
  epochs = 5,
  verbose = 2
)

Visualize

#' Plot training/validation loss and accuracy curves from a keras history.
#'
#' Note: plot(history) produces similar output with no custom code.
#'
#' @param history A keras training history, as returned by keras::fit().
#' @return A list of ggplot objects: loss_plot and acc_plot.
plot_history_metrics <- function(history) {
  # One row per epoch, one column per recorded metric
  metrics_df <- data.frame(history$metrics) %>%
    dplyr::mutate(epoch = dplyr::row_number())

  loss_plot <- ggplot(data = metrics_df) +
    geom_line(aes(x = epoch, y = loss, color = "training loss")) +
    geom_line(aes(x = epoch, y = val_loss, color = "validation loss")) +
    theme_bw() +
    labs(color = "Legend") +
    ggtitle("Model Loss")

  acc_plot <- ggplot(data = metrics_df) +
    geom_line(aes(x = epoch, y = accuracy, color = "training accuracy")) +
    geom_line(aes(x = epoch, y = val_accuracy, color = "validation accuracy")) +
    theme_bw() +
    labs(color = "Legend") +
    ggtitle("Model Accuracy")

  list(loss_plot = loss_plot, acc_plot = acc_plot)
}

plot_history_metrics(history)
$loss_plot


$acc_plot

Predictions

# Generate some predictions on the unseen data
predictions = stats::predict(model, x_test)
predictions %>% head()
             [,1]         [,2]         [,3]         [,4]         [,5]
[1,] 2.147511e-06 3.081970e-08 1.363529e-05 6.959682e-04 1.883129e-09
[2,] 4.554735e-07 1.738241e-05 9.998441e-01 5.303881e-05 3.181212e-13
[3,] 3.884773e-06 9.974592e-01 1.834323e-04 8.748658e-05 1.106339e-04
[4,] 9.970744e-01 2.236041e-06 1.048536e-04 6.934661e-05 1.657578e-04
[5,] 2.274211e-06 8.666921e-09 7.863012e-06 9.704200e-08 9.973132e-01
[6,] 3.012697e-07 9.972008e-01 3.240131e-06 6.145242e-06 1.901888e-05
             [,6]         [,7]         [,8]         [,9]        [,10]
[1,] 1.727558e-06 8.484718e-11 9.990858e-01 1.901059e-06 1.988278e-04
[2,] 5.569929e-05 2.090738e-07 1.970074e-10 2.905491e-05 5.533708e-11
[3,] 6.059188e-06 1.315140e-04 1.861338e-03 1.532390e-04 3.115008e-06
[4,] 6.942511e-06 1.050111e-03 8.822395e-04 5.042305e-06 6.392266e-04
[5,] 1.149999e-05 3.039010e-06 5.597093e-05 1.561662e-06 2.604362e-03
[6,] 3.678756e-08 1.334996e-06 2.748028e-03 1.928041e-05 1.795054e-06

Evaluate

# Evaluate performance
# test_results = model %>% 
#                   evaluate(mnist$test$x, mnist$test$y, verbose = 0)
# test_results

Save Model

One thing keras makes incredibly easy is the ability to save your model. This will create a folder and allow for easy access to and from your model if you need it for predictions in another environment or API.

# Serialize the model (it becomes a folder)
# keras::save_model_tf(object = model, filepath = "mnist_model")

Reload Model

# Reload the model
# reloaded_model = keras::load_model_tf("mnist_model")
# reloaded_model %>% summary
# base::all.equal(stats::predict(model, x_test), 
#                 stats::predict(reloaded_model, x_test))

Recognize Fashion

Recognizing other types of objects is just as easy as before. Let’s repeat our steps for a new dataset, because practice makes perfect!

Load the data

# Load the Fashion-MNIST dataset (same shapes as MNIST: 28x28 grayscale)
fashion_mnist <- dataset_fashion_mnist()

# Multi-assignment of the train/test splits (zeallot %<-%)
c(train_images, train_labels) %<-% fashion_mnist$train
c(test_images, test_labels) %<-% fashion_mnist$test

# Human-readable names for labels 0-9; index with label + 1
class_names <- c('T-shirt/top',
                 'Trouser',
                 'Pullover',
                 'Dress',
                 'Coat',
                 'Sandal',
                 'Shirt',
                 'Sneaker',
                 'Bag',
                 'Ankle boot')
dim(train_images)
[1] 60000    28    28
dim(train_labels)
[1] 60000
train_labels[1:20]
 [1] 9 0 0 3 0 2 7 2 5 5 0 9 5 5 7 9 1 0 6 4
dim(test_images)
[1] 10000    28    28
dim(test_labels)
[1] 10000

Preprocess the data

library(tidyr)
library(ggplot2)


# Inspect one raw image: reshape the 28x28 matrix into long (x, y, value)
# form so ggplot can render it as a tile heat map.
image_1 <- as.data.frame(train_images[1,,])
colnames(image_1) <- seq_len(ncol(image_1))
image_1$y <- seq_len(nrow(image_1))
# pivot_longer() supersedes the older gather()
image_1 <- tidyr::pivot_longer(image_1, cols = -y,
                               names_to = "x", values_to = "value")
image_1$x <- as.integer(image_1$x)

ggplot(image_1, aes(x=x,y=y,fill=value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "black", na.value = NA) +
  # Reverse y so row 1 of the array is drawn at the top
  scale_y_reverse() +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  theme(aspect.ratio = 1) +
  xlab("") +
  ylab("") +
  # Labels are 0-based, class_names is 1-based
  ggtitle(paste(class_names[train_labels[1]+1]))

# Scale pixel values from 0-255 down to [0, 1] before training
train_images <- train_images / 255
test_images <- test_images / 255

# Preview the first 25 training images in a 5x5 grid
par(mfcol = c(5,5))
# Fixed typo: 'axs' is not a graphical parameter; 'xaxs' was intended
# (cf. the identical par() call used for the prediction grid later on)
par(mar=c(0,0,1.5,0), xaxs='i', yaxs='i')
for(i in 1:25){
  img <- train_images[i,,]
  # img <- t(apply(img, 2, rev))  # would rotate to the usual orientation
  image(1:28, 1:28, img, col = gray((0:255)/255), xaxt = 'n', yaxt = 'n', main = paste(class_names[train_labels[i]+1]))
}

Build the model

# Same dense architecture as for MNIST, without explicit layer names
model <- keras_model_sequential() %>%
  layer_flatten(input_shape = c(28, 28)) %>% 
  layer_dense(units = 128, activation = 'relu') %>% 
  layer_dense(units = 10, activation = 'softmax')

# Labels are integers 0-9, so use the sparse crossentropy variant
model %>% compile(
  optimizer = 'adam', 
  loss = 'sparse_categorical_crossentropy',
  metrics = c('accuracy')
)

# Train for 5 epochs with a 30% validation split
model %>% 
  fit(x=train_images, y=train_labels, 
      epochs = 5, verbose = 2, validation_split=0.3)

Evaluate Accuracy

score <- model %>% evaluate(test_images, test_labels, verbose = 0)

cat('Test loss:', score[1], "\n")
Test loss: 0.3637977 
cat('Test accuracy:', score[2], "\n")
Test accuracy: 0.8708 

Make predictions

predictions <- model %>% predict(test_images)
predictions %>% head
             [,1]         [,2]         [,3]         [,4]         [,5]
[1,] 5.944940e-06 3.568362e-09 9.062223e-07 3.813431e-07 7.632132e-07
[2,] 6.902370e-05 6.028386e-09 9.972779e-01 5.225142e-08 3.154729e-04
[3,] 5.386319e-06 9.999939e-01 1.031454e-08 3.366561e-07 2.402087e-07
[4,] 9.895704e-06 9.999171e-01 3.511240e-07 6.675994e-05 4.046826e-06
[5,] 1.955759e-01 2.231707e-04 2.183121e-01 8.242820e-03 1.920183e-02
[6,] 1.318200e-03 9.986019e-01 8.571616e-06 8.032462e-06 3.114295e-05
             [,6]         [,7]         [,8]         [,9]        [,10]
[1,] 1.908284e-02 1.337585e-05 5.273721e-02 1.333138e-04 9.280253e-01
[2,] 3.593275e-11 2.337541e-03 3.329948e-12 3.700688e-08 2.485134e-13
[3,] 1.300379e-13 1.142511e-07 2.336990e-14 3.351031e-08 7.711556e-12
[4,] 4.459056e-11 1.710582e-06 8.355450e-12 1.272454e-07 1.614447e-09
[5,] 5.498093e-05 5.563514e-01 2.239543e-05 2.010975e-03 4.317736e-06
[6,] 2.356733e-10 3.082445e-05 1.095561e-10 1.486991e-06 3.072765e-09
preds = apply(predictions, 1, which.max)
preds %>% head
[1] 10  3  2  2  7  2
#or

preds = model %>% predict_classes(x = test_images)
preds %>% unique
 [1] 9 2 1 6 4 5 7 3 8 0

How well did we do?

# Show the first 25 test images titled "predicted (true)";
# a green title marks a correct prediction, red an incorrect one.
par(mfcol=c(5,5))
par(mar=c(0, 0, 1.5, 0), xaxs='i', yaxs='i')
for (i in 1:25) { 
  img <- test_images[i, , ]
  # Reverse/transpose so image() renders the picture upright
  img <- t(apply(img, 2, rev)) 
  # subtract 1 as labels go from 0 to 9
  predicted_label <- which.max(predictions[i, ]) - 1
  true_label <- test_labels[i]
  if (predicted_label == true_label) {
    color <- '#008800' 
  } else {
    color <- '#bb0000'
  }
  # col.main colors the title text, not the image itself
  image(1:28, 1:28, img, col = gray((0:255)/255), xaxt = 'n', yaxt = 'n',
        main = paste0(class_names[predicted_label + 1], " (",
                      class_names[true_label + 1], ")"),
        col.main = color)
}

train_images %>% dim
[1] 60000    28    28

Recognize Animals and Objects

library(tensorflow)
library(keras)

# CIFAR-10: 32x32 RGB images across 10 object categories
cifar <- dataset_cifar10()

# Labels 0-9 map to these names; index with label + 1
class_names <- c('airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')

index <- 1:30

# Render the first 30 training images in a 5x6 grid:
# split the array into a list of per-image arrays, name each element
# with its class, convert to raster (0-255 pixel range), and plot
# each image with its name as the title.
par(mfcol = c(5,6), mar = rep(1,4), oma=rep(0.2, 4))
cifar$train$x[index,,,] %>% 
  purrr::array_tree(margin=1) %>% 
  purrr::set_names(class_names[cifar$train$y[index] + 1]) %>% 
  purrr::map(as.raster, max = 255) %>% 
  purrr::iwalk(~{plot(.x); title(.y)})

Convolutional Neural Network

# A small CNN for 32x32x3 CIFAR-10 images: three conv layers
# (32, 64, 64 filters) with 2x2 max pooling between them,
# followed by a dense classification head.
model <- keras_model_sequential() %>% 
  layer_conv_2d(input_shape = c(32, 32, 3),  filters = 32, kernel_size = c(3,3), activation = "relu") %>% 
  layer_max_pooling_2d(pool_size = c(2,2)) %>% 
  layer_conv_2d(filters = 64, kernel_size = c(3,3), activation = "relu") %>% 
  layer_max_pooling_2d(pool_size = c(2,2)) %>% 
  layer_conv_2d(filters = 64, kernel_size = c(3,3), activation = "relu") %>% 
  # Collapse the final feature maps into a single vector for the head
  layer_flatten() %>% 
  layer_dense(units = 64, activation = "relu") %>% 
  layer_dense(units = 10, activation = "softmax")

summary(model)
Model: "sequential_3"
______________________________________________________________________
Layer (type)                   Output Shape                Param #    
======================================================================
conv2d_2 (Conv2D)              (None, 30, 30, 32)          896        
______________________________________________________________________
max_pooling2d_1 (MaxPooling2D) (None, 15, 15, 32)          0          
______________________________________________________________________
conv2d_1 (Conv2D)              (None, 13, 13, 64)          18496      
______________________________________________________________________
max_pooling2d (MaxPooling2D)   (None, 6, 6, 64)            0          
______________________________________________________________________
conv2d (Conv2D)                (None, 4, 4, 64)            36928      
______________________________________________________________________
flatten_1 (Flatten)            (None, 1024)                0          
______________________________________________________________________
dense_3 (Dense)                (None, 64)                  65600      
______________________________________________________________________
dense_2 (Dense)                (None, 10)                  650        
======================================================================
Total params: 122,570
Trainable params: 122,570
Non-trainable params: 0
______________________________________________________________________
# Integer labels -> sparse crossentropy; track accuracy during training
model %>% compile(
  optimizer = "adam",
  loss = "sparse_categorical_crossentropy",
  metrics = "accuracy"
)


# Train for 10 epochs, validating on the held-out test split each epoch.
# unname() strips the list names so fit() receives a plain (x, y) pair.
history <- model %>% 
  fit(
    x = cifar$train$x, y = cifar$train$y,
    epochs = 10,
    validation_data = unname(cifar$test),
    verbose = 2
  )
# Final held-out performance
evaluate(model, cifar$test$x, cifar$test$y, verbose = 0)
    loss accuracy 
1.069247 0.656700 

Transfer Learning

We will not actually train here, because it takes about 45 minutes; instead we will just load the previously trained model. The idea is to take a layer or many layers from a previous model and then stack our model on top of it. You don’t have to stack it “on top”, but I chose to here for simplicity. By taking what the previous model learned, we then put our custom output layers there so it can learn to classify new things with the old feature representations it learned from the ImageNet dataset.

library(devtools)
library(tfhub)
library(keras)
library(reticulate)

# Destructure the CIFAR-10 train/test splits (zeallot %<-%)
c(train_images, train_labels) %<-% cifar$train
c(test_images, test_labels) %<-% cifar$test

train_images %>% dim
[1] 50000    32    32     3
train_labels %>% dim
[1] 50000     1
test_images %>% dim
[1] 10000    32    32     3
test_labels %>% dim  
[1] 10000     1
image_shape <- c(32,32,3)

# Pretrained ResNet-101 feature extractor with ImageNet weights.
# include_top = FALSE drops the original 1000-class classifier head
# so we can attach our own.
conv_base <- keras::application_resnet101(weights = "imagenet",
                                          include_top = FALSE, 
                                          input_shape = c(32,32,3))


# Keep the pretrained weights fixed; only the new head will train
freeze_weights(conv_base)

# Stack a fresh 10-class classification head on top of the frozen base
model <- keras_model_sequential() %>%
  conv_base %>%
  layer_flatten() %>%
  # layer_reshape(c(1,2048)) %>% 
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")


model %>% compile(
  optimizer = "adam",
  loss = "sparse_categorical_crossentropy",
  metrics = "accuracy"
)


# Optionally fine-tune upper layers of the base after initial training:
# unfreeze_weights(conv_base, from = "block5_conv1")

# Train only the new head (base is frozen), 30% validation split
history <- model %>% fit(
  x=train_images, y=train_labels,
  validation_split = 0.3,
  epochs=3,
  verbose = 2
)

# model = keras::load_model_tf("cifar10_tl_model")
# model %>% summary

# summary(model)
# train_images[1,,,] %>% dim
# train_labels[1]

Visualize performance

# plot(history)

Save the model

The following code is used to serialize the model, since this is already done and the process is fairly intensive, we will not be repeating it here.

# # Serialize the model (it becomes a folder)
# keras::save_model_tf(object = model, filepath = "cifar10_tl_model")
# 
# # Reload the model
# reloaded_model = keras::load_model_tf("cifar10_tl_model")
# reloaded_model %>% summary

Evaluate the model

evaluate(model, x = test_images, y = test_labels)
    loss accuracy 
1.188451 0.592300 

References

https://tensorflow.rstudio.com/tutorials/beginners/ https://tensorflow.rstudio.com/tutorials/advanced/images/cnn/ https://tensorflow.rstudio.com/tutorials/advanced/images/transfer-learning-hub/ https://keras.rstudio.com/reference/freeze_layers.html