In this post we will explore how to do deep regression in R using Keras.
The objective of this post is to walk through doing regression on tabular data in Keras. There is an abundance of tutorials exploring the mechanics of this; our goal is to open the mind with examples.
boston_housing <- dataset_boston_housing()
train_data <- boston_housing$train$x
train_labels <- boston_housing$train$y
test_data <- boston_housing$test$x
test_labels <- boston_housing$test$y
# train_data is a matrix, so length() counts every cell (404 * 13 = 5252),
# not the number of examples; nrow() reports the actual sample count, which
# then matches the number of labels.
paste0("Training entries: ", nrow(train_data), ", labels: ", length(train_labels))
[1] "Training entries: 404, labels: 404"
column_names <- c('CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                  'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT')
# Build the training frame: matrix -> tibble, name the columns, attach target
train_df <- dplyr::as_tibble(train_data, .name_repair = "minimal")
train_df <- stats::setNames(train_df, column_names)
train_df <- dplyr::mutate(train_df, label = train_labels)
train_df
# A tibble: 404 x 14
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.23 0 8.14 0 0.538 6.14 91.7 3.98 4 307
2 0.0218 82.5 2.03 0 0.415 7.61 15.7 6.27 2 348
3 4.90 0 18.1 0 0.631 4.97 100 1.33 24 666
4 0.0396 0 5.19 0 0.515 6.04 34.5 5.99 5 224
5 3.69 0 18.1 0 0.713 6.38 88.4 2.57 24 666
6 0.284 0 7.38 0 0.493 5.71 74.3 4.72 5 287
7 9.19 0 18.1 0 0.7 5.54 100 1.58 24 666
8 4.10 0 19.6 0 0.871 5.47 100 1.41 5 403
9 2.16 0 19.6 0 0.871 5.63 100 1.52 5 403
10 1.63 0 21.9 0 0.624 5.02 100 1.44 4 437
# ... with 394 more rows, and 4 more variables: PTRATIO <dbl>,
# B <dbl>, LSTAT <dbl>, label <dbl>
# Build the test frame exactly the same way as the training frame
test_df <- dplyr::as_tibble(test_data, .name_repair = "minimal")
test_df <- stats::setNames(test_df, column_names)
test_df <- dplyr::mutate(test_df, label = test_labels)
test_df
# A tibble: 102 x 14
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 18.1 0 18.1 0 0.679 6.43 100 1.83 24 666
2 0.123 0 10.0 0 0.547 5.91 92.9 2.35 6 432
3 0.0550 0 5.19 0 0.515 5.98 45.4 4.81 5 224
4 1.27 0 19.6 1 0.605 6.25 92.6 1.80 5 403
5 0.0715 0 4.49 0 0.449 6.12 56.8 3.75 3 247
6 0.280 0 9.69 0 0.585 5.93 42.6 2.38 6 391
7 0.0305 55 3.78 0 0.484 6.87 28.1 6.47 5 370
8 0.0355 25 4.86 0 0.426 6.17 46.7 5.40 4 281
9 0.0930 0 25.6 0 0.581 5.96 92.9 2.09 2 188
10 3.57 0 18.1 0 0.58 6.44 75 2.90 24 666
# ... with 92 more rows, and 4 more variables: PTRATIO <dbl>,
# B <dbl>, LSTAT <dbl>, label <dbl>
train_labels[1:10]
[1] 15.2 42.3 50.0 21.1 17.7 18.5 11.3 15.6 15.6 14.4
# See what features are correlated
# Center and normalize the predictors before computing correlations
Xtrain <- train_df %>% select(-label)
Xs <- scale(Xtrain, center = TRUE, scale = TRUE)
# Tidy the correlation matrix into long (from, to, correlation) form.
# tidyr::gather() is superseded; pivot_longer() is the current API.
C <- cor(Xs) %>% as.data.frame
C$from <- row.names(C)
C <- C %>% tidyr::pivot_longer(-from, names_to = "to", values_to = "correlation")
# Present a heatmap
plt <- ggplot(data = C) +
  geom_tile(aes(x = to, y = from, fill = correlation)) +
  scale_fill_gradient(low = "red", high = "green") +
  ggtitle("Correlation between Boston House features") +
  theme(axis.text.x = element_text(angle = 90))
plt

A few areas of concern are the feature pairs with correlation magnitudes greater than about 0.5. Such features may carry excessive, redundant information. Let's take a closer look at the most strongly correlated pairs.
Below is an analysis of some of the correlations. I performed a detailed analysis on a pre-centered correlation matrix, which I will leave below for kicks. But ultimately, high correlated features should be considered for removal from the model.
# Present a heatmap restricted to the strongly correlated pairs
strong_cor <- C %>% filter(abs(correlation) > 0.7)
plt <- ggplot(data = strong_cor) +
  geom_tile(aes(x = to, y = from, fill = correlation)) +
  scale_fill_gradient(low = "red", high = "green") +
  ggtitle("Correlations between Boston House features") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
plt

There are several columns that could be dropped due to their high correlation with other variables. Let’s keep track of these for a hot minute. We will drop these and remodel later to see if this helps any.
# Candidate columns to drop later because of their high pairwise correlation
EXCESSIVE_FEATURES <- c("DIS", "NOX", "TAX")
EXCESSIVE_FEATURES
[1] "DIS" "NOX" "TAX"
# Center and normalize
# Fit the scaling (per-column mean and sd) on the training predictors only
Xtrain <- train_df %>% select(-label)
Xs_train <- scale(Xtrain, center = TRUE, scale = TRUE)
train_df_scaled <- cbind(Xs_train, train_df %>% select(label))
# Apply the *training* centers and scales to the test set. Scaling the test
# set with its own statistics (the original T/T call) leaks test information
# and puts train and test features on slightly different scales.
Xtest <- test_df %>% select(-label)
Xs_test <- scale(Xtest,
                 center = attr(Xs_train, "scaled:center"),
                 scale = attr(Xs_train, "scaled:scale"))
test_df_scaled <- cbind(Xs_test, test_df %>% select(label))
Some cheat codes are required to make keras work with
dataframes. First, you have to make a multi-head model. Define an
input with your dimensionality of the data frame. Second,
create your other layers to pass it through. Merge them using the
keras_model function. R is great, but can be very painful
at times. This was not pleasant. Further, only feed in
matrices. Don’t even fight with throwing in a data frame. This
API isn’t smart like that.
# Input head
# Derive the input dimensionality from the data instead of hard-coding 13,
# so the model stays correct if the feature set changes.
n_features <- ncol(train_df %>% select(-label))
input <- keras::layer_input(shape = c(n_features))
# Output head. The batch-normalization layer standardizes the features
# inside the network, which is why the raw (unscaled) train_df can be fed
# directly below.
output <- input %>%
  keras::layer_batch_normalization() %>%
  keras::layer_dense(units = 1)
# Join input and output heads
model <- keras::keras_model(input, output)
model %>% keras::compile(
  loss = "mse",
  optimizer = "adam",
  metrics = list("mean_absolute_error")
)
# Keras only accepts matrices here, not data frames
history <- model %>% keras::fit(
  x = train_df %>% select(-label) %>% as.matrix,
  y = train_df$label %>% as.matrix,
  validation_split = 0.1,
  epochs = 100,
  verbose = 0
)
plot(history)

loss mean_absolute_error
29.055779 3.862492
# Predict on the held-out test set (keras wants a matrix, not a data frame)
ypred <- predict(model,
                 x = test_df %>% select(-label) %>% as.matrix)
tmp <- data.frame(id = seq_along(ypred),
                  prediction = as.vector(ypred),
                  target = test_df$label)
# NOTE: the `mse` column is really the per-observation *absolute* error —
# sqrt((p - t)^2) is just |p - t|. The column name is kept so the histogram
# below is unchanged. mutate() can reference columns directly; tmp$ is not
# needed (and is fragile) inside mutate.
tmp <- tmp %>% dplyr::mutate(mse = abs(prediction - target),
                             error = prediction - target)
plt <- ggplot(data = tmp) +
  geom_point(aes(x = id, y = prediction, color = "prediction")) +
  geom_point(aes(x = id, y = target, color = "target")) +
  theme_classic()
plt  # display the prediction-vs-target scatter (was built but never shown)
ggplot(tmp) +
  geom_histogram(aes(x = mse, fill = "mse"), alpha = 0.4) +
  geom_histogram(aes(x = error, fill = "error"), alpha = 0.4) +
  labs(fill = "Error Distribution") +
  ggtitle("Test Error") +
  xlab("Error") +
  ylab("Occurrences") +
  theme_light()
shapiro.test(tmp$error) # This is probably normal. We can do inferential statistics on the population!
Shapiro-Wilk normality test
data: tmp$error
W = 0.92301, p-value = 1.706e-05
To save you time and effort: the experiment of dropping the correlated EXCESSIVE_FEATURES variables actually made the model worse. This is probably because those features still carry valuable information in this relatively small data set.
Since we built our regression model with just one layer, the weights in that layer are in fact our parameters
# The dense layer is the third layer in the model: input, batch-norm, dense
dense_layer <- model$layers[[3]]
dense_weights <- dense_layer$get_weights()
theta <- dense_weights[[1]]  # kernel: one coefficient per feature
bias <- dense_weights[[2]]   # intercept term
theta
[,1]
[1,] -1.101465
[2,] -1.401569
[3,] 1.574454
[4,] -1.463762
[5,] 1.619609
[6,] 1.167189
[7,] -1.666827
[8,] 1.558088
[9,] -1.257356
[10,] -1.094540
[11,] 1.392148
[12,] 1.659781
[13,] -1.575807
# Pair each learned coefficient with its feature name; the bias is the intercept
feature_names <- names(train_df %>% select(-label))
est_df <- data.frame(variable = c(feature_names, "intercept"),
                     estimate = c(theta, bias))
ggplot(data = est_df) +
  geom_col(aes(x = variable, y = estimate, fill = variable)) +
  coord_flip() +
  labs(fill = "Variable Estimate") +
  ggtitle("Boston Housing Factors") +
  ylab("Coefficient Estimate") +
  xlab("Variable") +
  theme_bw()

https://tensorflow.rstudio.com/tutorials/beginners/basic-ml/tutorial_basic_regression/