Create Custom ML Models using Keras in R

machine learning deep learning non-linear programming automatic differentiation keras r

In this post we will explore automatic differentiation as applied to custom machine learning model generation using keras in R.

Blake Conrad https://github.com/conradbm
2022-07-18

Building Custom Models

In the post on Automatic Differentiation with Keras in R there was a walk through some of the documentation and an example of using the GradientTape API. There is no creativity in this post, just a repeat from the keras documentation. The goal here is to take an example, play around with it, and explore the API a bit.

Fitting a linear model

library(tensorflow)
Model <- R6::R6Class(
  classname = "Model",
  public = list(
    W = NULL,
    b = NULL,
    
    initialize = function(){
      self$W <- tf$Variable(5) # Initialize weight to 5
      self$b <- tf$Variable(0) # Initialize bias to 0 
      
    },
    
    call = function(x){
      self$W*x + self$b
    }
  )
)
model <- Model$new()
model$call(3)
tf.Tensor(15.0, shape=(), dtype=float32)

Define a loss

loss <- function(y_pred, y_true) {
  tf$reduce_mean(tf$square(y_pred - y_true))
}

Obtain training data

TRUE_W = 3.0
TRUE_b = 2.0

NUM_EXAMPLES = 1000

# Random 1-d vector of inputs
inputs = tf$random$normal(shape = shape(NUM_EXAMPLES))
# Random 1-d noise
noise = tf$random$normal(shape = shape(NUM_EXAMPLES))

# True 1-d output target population
outputs = inputs * TRUE_W + TRUE_b + noise
library(dplyr)
library(ggplot2)
df = tibble(
  inputs = as.numeric(inputs),
  outputs = as.numeric(outputs),
  predicted = as.numeric(model$call(inputs))
) 

df %>% ggplot(., aes(x=inputs)) +
  geom_point(aes(y = outputs)) +
  geom_line(aes(y = predicted), color = "blue")

Define a training loop

train <- function(model, inputs, outputs, learning_rate) {
  with (tf$GradientTape() %as% t, {
    current_loss = loss(model$call(inputs), outputs)
  })
  
  d <- t$gradient(current_loss, list(model$W, model$b))
  
  model$W$assign_sub(learning_rate * d[[1]])
  model$b$assign_sub(learning_rate * d[[2]])
  current_loss
}
library(glue)

model <- Model$new()

Ws <- bs <- c()

for(epoch in seq_len(20)){
  
  Ws[epoch] <- as.numeric(model$W)
  bs[epoch] <- as.numeric(model$b)
  
  current_loss <- train(model, inputs, outputs, learning_rate = 0.1)
  cat(glue::glue("Epoch: {epoch}, Loss: {as.numeric(current_loss)}"), "\n")
}
Epoch: 1, Loss: 9.27498435974121 
Epoch: 2, Loss: 6.29132127761841 
Epoch: 3, Loss: 4.38681507110596 
Epoch: 4, Loss: 3.17101907730103 
Epoch: 5, Loss: 2.39479780197144 
Epoch: 6, Loss: 1.89916634559631 
Epoch: 7, Loss: 1.58266150951385 
Epoch: 8, Loss: 1.38052105903625 
Epoch: 9, Loss: 1.25140619277954 
Epoch: 10, Loss: 1.16892516613007 
Epoch: 11, Loss: 1.11622834205627 
Epoch: 12, Loss: 1.08255589008331 
Epoch: 13, Loss: 1.0610374212265 
Epoch: 14, Loss: 1.04728376865387 
Epoch: 15, Loss: 1.0384920835495 
Epoch: 16, Loss: 1.03287136554718 
Epoch: 17, Loss: 1.02927708625793 
Epoch: 18, Loss: 1.0269787311554 
Epoch: 19, Loss: 1.02550864219666 
Epoch: 20, Loss: 1.02456820011139 
library(tidyr)

# Variables per column
df = tibble(
  epoch = 1:20,
  Ws = Ws,
  bs = bs
)

# Melt on multiple columns! Name their similarity..
df = df %>% tidyr::pivot_longer(c(Ws, bs), 
                           names_to = "parameter", 
                           values_to = "estimate")

ggplot(df, aes(x=epoch, y=estimate)) +
  geom_line() +
  facet_wrap(~parameter, scales = "free")

Custom Logistic Regression

Now let’s build a custom logistic regression model.

# Generate data
df <- data.frame(matrix(rnorm(1000), nrow=100, ncol=2))
df$Y <- rbinom(n = 100, size = 1, prob = 0.25)

X <- df %>% 
  select(-Y)
y <- df %>% select(Y) %>% 
  dplyr::pull(Y) %>% 
  as.matrix

Notice the data is collected from the following distributions:

Independent Variables \[ X \sim \mathcal{N}(\mu=0,\,\sigma^{2}=1)\ \] Dependent Variables \[ y \sim \mathcal{B}(n=1, p=0.25)\ \] ### Build the model

library(tensorflow)
library(dplyr)

Model <- R6::R6Class(
  classname = "Model",
  public = list(
    W = NULL,
    b = NULL,
    
    initialize = function(D){
      
      # Manually runs great, but won't knit..
      self$W <- tf$Variable(matrix(rnorm(n = D), nrow=1, ncol=D))
      self$b <- tf$Variable(matrix(rnorm(n = 1), nrow=1, ncol=1))

    },
    
    call = function(x){
      1/(1 + exp(-(tf$matmul(x, t(self$W)) + self$b)))
    }
  )
)


model <- Model$new(ncol(X))
model$call(X) %>% head
tf.Tensor(
[[0.90854418]
 [0.94750512]
 [0.72264755]
 [0.69501559]
 [0.97674013]
 [0.85342092]], shape=(6, 1), dtype=float64)

\[ L(X; W, b) = \prod_{i=1}^n{\hat y_{i}^{y_{i}} (1-\hat y_{i})^{1-y_{i}} } \]

Log likelihood

# Monotonic increasing, value function
log_likelihood <- function(y_pred, y_true){
  ll <- tf$reduce_sum(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))
  ll
}

# y_pred = model$call(X)

# log_likelihood(model$call(X), y)

Train

# Generate data
df <- data.frame(matrix(rnorm(1000), nrow=100, ncol=2))
df$Y <- rbinom(n = 100, size = 1, prob = 0.25)

X <- df %>% 
  select(-Y)
y <- df %>% select(Y) %>% 
  dplyr::pull(Y) %>% 
  as.matrix


D = ncol(X)

# Create a fresh model
learning_rate = 0.1
W = tf$Variable(matrix(rnorm(D), nrow=1, ncol=D))
b = tf$Variable(matrix(rnorm(1), nrow=1, ncol=1))

sigmoid = function(z){
  1/(1+exp(-z))
}

predict.sigmoid = function(x, W, b){
  y_pred = sigmoid(tf$matmul(X, t(W)) + b)
  y_pred
}

losses <- c()

for(epoch in seq_len(20)){
  
  
  with (tf$GradientTape() %as% t, {
    t$watch(W)
    t$watch(b)
    y_pred = predict.sigmoid(X, W, b)
    
    L = log_likelihood(y_pred, y)
  })
  
  d <- t$gradient(L, list(W, b))
    
  # Gradient ascent
  W$assign_add(learning_rate * d[[1]])
  b$assign_add(learning_rate * d[[2]])
  
  current_loss = L
  
  cat(glue::glue("Epoch: {epoch}, Loss: {as.numeric(current_loss)}"), "\n")

  losses[epoch] <- as.numeric(current_loss)
    
}
Epoch: 1, Loss: -64.884168727686 
Epoch: 2, Loss: -52.0970964369214 
Epoch: 3, Loss: -40.7580844729695 
Epoch: 4, Loss: -39.6961741609297 
Epoch: 5, Loss: -39.6138575017734 
Epoch: 6, Loss: -39.5851677656405 
Epoch: 7, Loss: -39.5785648703051 
Epoch: 8, Loss: -39.5763458015593 
Epoch: 9, Loss: -39.5757237923894 
Epoch: 10, Loss: -39.5755299326785 
Epoch: 11, Loss: -39.5754728446656 
Epoch: 12, Loss: -39.5754554963127 
Epoch: 13, Loss: -39.5754503139949 
Epoch: 14, Loss: -39.5754487512676 
Epoch: 15, Loss: -39.5754482824529 
Epoch: 16, Loss: -39.5754481414108 
Epoch: 17, Loss: -39.5754480990443 
Epoch: 18, Loss: -39.5754480863074 
Epoch: 19, Loss: -39.5754480824799 
Epoch: 20, Loss: -39.5754480813295 

Visualize

library(ggplot2)
tibble(
  epoch = 1:20,
  loss = losses
) %>% 
  ggplot(., aes(x = epoch)) +
  geom_line(aes(y=loss))

Learning populations

df <- tibble(
  y_true = y,
  y_pred = predict.sigmoid(X, W, b) %>% as.matrix
) %>% 
  mutate(y_pred_class = if_else(condition = y_pred > 0.5, true = 1, false = 0))

df %>% 
  ggplot(.,) + 
  geom_density(aes(x=y_true, fill="Target"), alpha = 0.5) +
  geom_density(aes(x=y_pred, fill="Prediction"), alpha = 0.5) +
  labs(fill = "Response") +
  ggtitle("Target vs. Prediction Distribution") +
  theme_classic()

It looks like training in this way the Logistic Classifier was able to discover the population parameter for our underlying response distribution.

\[ \hat p =0.25 \]

Predictions

But how well did it predict on the training data? Notice the over prediction on the class 0 and the under prediction on the class 1. This is an infamous illustration of the class imbalance problem and how we learned to maximize a the majority, which was not the target class.

df %>% ggplot() + 
  geom_bar(aes(x=y_true, fill="Target"), alpha = 0.5) +
  geom_bar(aes(x=y_pred_class, fill = "Prediction"), alpha = 0.5) +
  labs(fill = "Response", x = "Class", y="Appearances") +
  ggtitle("Target vs. Prediction")

References

https://tensorflow.rstudio.com/tutorials/advanced/customization/custom-training/