In this post we will explore automatic differentiation as applied to custom machine learning model generation using keras in R.
The post on Automatic Differentiation with Keras in R
walked through some of the documentation and gave an example of
using the GradientTape API. There is nothing new in this
post beyond the keras documentation itself. The goal
here is simply to take an example, play around with it, and explore the API a
bit.
library(tensorflow)
# A minimal linear model, y = W * x + b, whose parameters live as
# mutable TensorFlow state inside an R6 object.
Model <- R6::R6Class(
  classname = "Model",
  public = list(
    W = NULL,  # slope (tf$Variable)
    b = NULL,  # intercept (tf$Variable)
    # Start from a deliberately wrong guess so training has work to do.
    initialize = function() {
      self$W <- tf$Variable(5)  # Initialize weight to 5
      self$b <- tf$Variable(0)  # Initialize bias to 0
    },
    # Forward pass: elementwise W * x + b.
    call = function(x) {
      self$W * x + self$b
    }
  )
)
# Instantiate and sanity-check the untrained model: 5 * 3 + 0 should be 15.
model <- Model$new()
model$call(3)
tf.Tensor(15.0, shape=(), dtype=float32)
# Mean squared error between predictions and targets.
loss <- function(y_pred, y_true) {
  squared_errors <- tf$square(y_pred - y_true)
  tf$reduce_mean(squared_errors)
}
library(dplyr)
library(ggplot2)
# Collect observed data and current model predictions in one tibble.
# NOTE(review): `inputs` and `outputs` come from an earlier data-generation
# chunk that is not shown here -- confirm they are numeric vectors/tensors.
df <- tibble(
  inputs = as.numeric(inputs),
  outputs = as.numeric(outputs),
  predicted = as.numeric(model$call(inputs))
)

# Scatter the observed data and overlay the model's current (untrained) fit.
df %>%
  ggplot(aes(x = inputs)) +
  geom_point(aes(y = outputs)) +
  geom_line(aes(y = predicted), color = "blue")

library(glue)
model <- Model$new()

# Preallocate the parameter traces instead of growing vectors inside the loop.
n_epochs <- 20
Ws <- numeric(n_epochs)
bs <- numeric(n_epochs)

for (epoch in seq_len(n_epochs)) {
  # Record parameters *before* the update so epoch 1 shows the initial values.
  Ws[epoch] <- as.numeric(model$W)
  bs[epoch] <- as.numeric(model$b)
  # NOTE(review): train() is defined in an earlier chunk that is not shown.
  current_loss <- train(model, inputs, outputs, learning_rate = 0.1)
  cat(glue::glue("Epoch: {epoch}, Loss: {as.numeric(current_loss)}"), "\n")
}
Epoch: 1, Loss: 9.27498435974121
Epoch: 2, Loss: 6.29132127761841
Epoch: 3, Loss: 4.38681507110596
Epoch: 4, Loss: 3.17101907730103
Epoch: 5, Loss: 2.39479780197144
Epoch: 6, Loss: 1.89916634559631
Epoch: 7, Loss: 1.58266150951385
Epoch: 8, Loss: 1.38052105903625
Epoch: 9, Loss: 1.25140619277954
Epoch: 10, Loss: 1.16892516613007
Epoch: 11, Loss: 1.11622834205627
Epoch: 12, Loss: 1.08255589008331
Epoch: 13, Loss: 1.0610374212265
Epoch: 14, Loss: 1.04728376865387
Epoch: 15, Loss: 1.0384920835495
Epoch: 16, Loss: 1.03287136554718
Epoch: 17, Loss: 1.02927708625793
Epoch: 18, Loss: 1.0269787311554
Epoch: 19, Loss: 1.02550864219666
Epoch: 20, Loss: 1.02456820011139
library(tidyr)
# Variables per column
# One row per epoch, one column per tracked parameter.
df <- tibble(
  epoch = seq_len(20),
  Ws = Ws,
  bs = bs
)

# Reshape to long form so each parameter gets its own facet.
df <- df %>%
  tidyr::pivot_longer(c(Ws, bs),
                      names_to = "parameter",
                      values_to = "estimate")

ggplot(df, aes(x = epoch, y = estimate)) +
  geom_line() +
  facet_wrap(~parameter, scales = "free")

Now let’s build a custom logistic regression model.
Notice the data is collected from the following distributions:
Independent variables: \[ X \sim \mathcal{N}(\mu = 0,\, \sigma^{2} = 1) \] Dependent variable: \[ y \sim \mathcal{B}(n = 1,\, p = 0.25) \]

### Build the model
library(tensorflow)
library(dplyr)
# Logistic-regression model: sigmoid(x %*% t(W) + b), with the parameters
# held as TensorFlow state inside an R6 object.
Model <- R6::R6Class(
  classname = "Model",
  public = list(
    W = NULL,  # 1 x D weight matrix (tf$Variable)
    b = NULL,  # 1 x 1 bias (tf$Variable)
    # D: number of input features. Parameters start as random normal draws.
    initialize = function(D) {
      # Manually runs great, but won't knit..
      self$W <- tf$Variable(matrix(rnorm(n = D), nrow = 1, ncol = D))
      self$b <- tf$Variable(matrix(rnorm(n = 1), nrow = 1, ncol = 1))
    },
    # Forward pass: linear score, then logistic squashing into (0, 1).
    call = function(x) {
      z <- tf$matmul(x, t(self$W)) + self$b
      1 / (1 + exp(-z))
    }
  )
)
# Size the model to the data and peek at the untrained predictions.
model <- Model$new(ncol(X))
model$call(X) %>% head
tf.Tensor(
[[0.90854418]
[0.94750512]
[0.72264755]
[0.69501559]
[0.97674013]
[0.85342092]], shape=(6, 1), dtype=float64)
\[ L(X; W, b) = \prod_{i=1}^n{\hat y_{i}^{y_{i}} (1-\hat y_{i})^{1-y_{i}} } \]
# Generate data: 100 rows, 2 standard-normal features, Bernoulli(0.25) target.
# NOTE(review): rnorm(1000) fills a 100 x 2 matrix, so only the first 200
# draws are used -- rnorm(200) would suffice.
df <- data.frame(matrix(rnorm(1000), nrow = 100, ncol = 2))
df$Y <- rbinom(n = 100, size = 1, prob = 0.25)

X <- df %>%
  select(-Y)

# pull() already extracts the column, so the select(Y) step was redundant.
y <- df %>%
  dplyr::pull(Y) %>%
  as.matrix

D <- ncol(X)
# Create a fresh model: gradient-ascent step size and random initial
# parameters (W is 1 x D, b is 1 x 1), outside the R6 class this time.
learning_rate <- 0.1
W <- tf$Variable(matrix(rnorm(D), nrow = 1, ncol = D))
b <- tf$Variable(matrix(rnorm(1), nrow = 1, ncol = 1))
# Logistic function; vectorized, works on plain numerics and on TF tensors
# (exp and arithmetic dispatch to the tensor methods).
sigmoid <- function(z) {
  1 / (1 + exp(-z))
}
# Predicted probabilities for design matrix x given weights W (1 x D) and
# bias b.
# BUG FIX: the original body used the global `X` instead of the `x` argument,
# so the function silently ignored its input. Every existing call passes X,
# so behavior for those callers is unchanged.
# NOTE(review): the name predict.sigmoid collides with S3 dispatch for
# predict() on objects of class "sigmoid"; kept for compatibility.
predict.sigmoid <- function(x, W, b) {
  sigmoid(tf$matmul(x, t(W)) + b)
}
# Maximize the log-likelihood by gradient ascent, recording loss per epoch.
n_epochs <- 20
losses <- numeric(n_epochs)  # preallocate instead of growing in the loop

for (epoch in seq_len(n_epochs)) {
  # Bind the tape to `tape`, not `t`: the original shadowed base::t(),
  # which predict.sigmoid relies on for transposition.
  with(tf$GradientTape() %as% tape, {
    tape$watch(W)
    tape$watch(b)
    y_pred <- predict.sigmoid(X, W, b)
    # NOTE(review): log_likelihood() is defined in a chunk not shown here.
    L <- log_likelihood(y_pred, y)
  })
  d <- tape$gradient(L, list(W, b))
  # Gradient ascent: step *up* the log-likelihood surface.
  W$assign_add(learning_rate * d[[1]])
  b$assign_add(learning_rate * d[[2]])
  current_loss <- L
  cat(glue::glue("Epoch: {epoch}, Loss: {as.numeric(current_loss)}"), "\n")
  losses[epoch] <- as.numeric(current_loss)
}
Epoch: 1, Loss: -64.884168727686
Epoch: 2, Loss: -52.0970964369214
Epoch: 3, Loss: -40.7580844729695
Epoch: 4, Loss: -39.6961741609297
Epoch: 5, Loss: -39.6138575017734
Epoch: 6, Loss: -39.5851677656405
Epoch: 7, Loss: -39.5785648703051
Epoch: 8, Loss: -39.5763458015593
Epoch: 9, Loss: -39.5757237923894
Epoch: 10, Loss: -39.5755299326785
Epoch: 11, Loss: -39.5754728446656
Epoch: 12, Loss: -39.5754554963127
Epoch: 13, Loss: -39.5754503139949
Epoch: 14, Loss: -39.5754487512676
Epoch: 15, Loss: -39.5754482824529
Epoch: 16, Loss: -39.5754481414108
Epoch: 17, Loss: -39.5754480990443
Epoch: 18, Loss: -39.5754480863074
Epoch: 19, Loss: -39.5754480824799
Epoch: 20, Loss: -39.5754480813295
library(ggplot2)
# Loss trajectory across training epochs.
tibble(
  epoch = 1:20,
  loss = losses
) %>%
  ggplot(aes(x = epoch)) +
  geom_line(aes(y = loss))

# Compare the target distribution with the fitted probabilities, and
# threshold the probabilities at 0.5 to get hard class labels.
df <- tibble(
  y_true = y,
  y_pred = predict.sigmoid(X, W, b) %>% as.matrix
) %>%
  mutate(y_pred_class = if_else(condition = y_pred > 0.5, true = 1, false = 0))

# ggplot(.,) had a redundant "." and a stray trailing comma; cleaned up.
df %>%
  ggplot() +
  geom_density(aes(x = y_true, fill = "Target"), alpha = 0.5) +
  geom_density(aes(x = y_pred, fill = "Prediction"), alpha = 0.5) +
  labs(fill = "Response") +
  ggtitle("Target vs. Prediction Distribution") +
  theme_classic()

It looks like training in this way the
Logistic Classifier was able to discover the population
parameter for our underlying response distribution.
\[ \hat p =0.25 \]
But how well did it predict on the training data? Notice the
over-prediction on class 0 and the
under-prediction on class 1. This is a classic
illustration of the class imbalance problem: the model
learned to maximize fit to the majority class, which was not the target class.
# Class counts: targets vs. thresholded predictions, overlaid.
ggplot(df) +
  geom_bar(aes(x = y_true, fill = "Target"), alpha = 0.5) +
  geom_bar(aes(x = y_pred_class, fill = "Prediction"), alpha = 0.5) +
  labs(fill = "Response", x = "Class", y = "Appearances") +
  ggtitle("Target vs. Prediction")

https://tensorflow.rstudio.com/tutorials/advanced/customization/custom-training/