In this post we will explore how to do deep regression in R using Keras.
The objective of this post is to walk through doing regression on tabular data in Keras. There is an abundance of tutorials exploring the mechanics of this; our goal is to open the mind with examples.
boston_housing <- dataset_boston_housing()
train_data <- boston_housing$train$x
train_labels <- boston_housing$train$y
test_data <- boston_housing$test$x
test_labels <- boston_housing$test$y
# train_data is a matrix, so length() counts every cell (404 * 13 = 5252),
# not the number of examples; nrow() reports the actual sample count, which
# then matches the number of labels.
paste0("Training entries: ", nrow(train_data), ", labels: ", length(train_labels))
[1] "Training entries: 404, labels: 404"
column_names <- c('CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                  'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT')
# Build the training frame: matrix -> tibble, name the columns, attach target
train_df <- dplyr::as_tibble(train_data, .name_repair = "minimal")
train_df <- stats::setNames(train_df, column_names)
train_df <- dplyr::mutate(train_df, label = train_labels)
train_df
# A tibble: 404 x 14
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.23 0 8.14 0 0.538 6.14 91.7 3.98 4 307
2 0.0218 82.5 2.03 0 0.415 7.61 15.7 6.27 2 348
3 4.90 0 18.1 0 0.631 4.97 100 1.33 24 666
4 0.0396 0 5.19 0 0.515 6.04 34.5 5.99 5 224
5 3.69 0 18.1 0 0.713 6.38 88.4 2.57 24 666
6 0.284 0 7.38 0 0.493 5.71 74.3 4.72 5 287
7 9.19 0 18.1 0 0.7 5.54 100 1.58 24 666
8 4.10 0 19.6 0 0.871 5.47 100 1.41 5 403
9 2.16 0 19.6 0 0.871 5.63 100 1.52 5 403
10 1.63 0 21.9 0 0.624 5.02 100 1.44 4 437
# ... with 394 more rows, and 4 more variables: PTRATIO <dbl>,
# B <dbl>, LSTAT <dbl>, label <dbl>
# Build the test frame exactly the same way as the training frame
test_df <- dplyr::as_tibble(test_data, .name_repair = "minimal")
test_df <- stats::setNames(test_df, column_names)
test_df <- dplyr::mutate(test_df, label = test_labels)
test_df
# A tibble: 102 x 14
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 18.1 0 18.1 0 0.679 6.43 100 1.83 24 666
2 0.123 0 10.0 0 0.547 5.91 92.9 2.35 6 432
3 0.0550 0 5.19 0 0.515 5.98 45.4 4.81 5 224
4 1.27 0 19.6 1 0.605 6.25 92.6 1.80 5 403
5 0.0715 0 4.49 0 0.449 6.12 56.8 3.75 3 247
6 0.280 0 9.69 0 0.585 5.93 42.6 2.38 6 391
7 0.0305 55 3.78 0 0.484 6.87 28.1 6.47 5 370
8 0.0355 25 4.86 0 0.426 6.17 46.7 5.40 4 281
9 0.0930 0 25.6 0 0.581 5.96 92.9 2.09 2 188
10 3.57 0 18.1 0 0.58 6.44 75 2.90 24 666
# ... with 92 more rows, and 4 more variables: PTRATIO <dbl>,
# B <dbl>, LSTAT <dbl>, label <dbl>
train_labels[1:10]
[1] 15.2 42.3 50.0 21.1 17.7 18.5 11.3 15.6 15.6 14.4
# See what features are correlated
# Center and normalize the predictors before computing correlations
Xtrain <- train_df %>% select(-label)
Xs <- scale(Xtrain, center = TRUE, scale = TRUE)
# Tidy the correlation matrix into long (from, to, correlation) form.
# tidyr::gather() is superseded; pivot_longer() is the current API.
C <- cor(Xs) %>% as.data.frame
C$from <- row.names(C)
C <- C %>% tidyr::pivot_longer(-from, names_to = "to", values_to = "correlation")
# Present a heatmap
plt <- ggplot(data = C) +
  geom_tile(aes(x = to, y = from, fill = correlation)) +
  scale_fill_gradient(low = "red", high = "green") +
  ggtitle("Correlation between Boston House features") +
  theme(axis.text.x = element_text(angle = 90))
plt

A few areas of concern are the feature pairs with correlation magnitudes greater than about 0.5. Such features may carry excessive, redundant information. Let's take a closer look at the most strongly correlated pairs.
Below is an analysis of some of the correlations. I performed a detailed analysis on a pre-centered correlation matrix, which I will leave below for kicks. But ultimately, high correlated features should be considered for removal from the model.
# Present a heatmap restricted to the strongly correlated pairs
strong_cor <- C %>% filter(abs(correlation) > 0.7)
plt <- ggplot(data = strong_cor) +
  geom_tile(aes(x = to, y = from, fill = correlation)) +
  scale_fill_gradient(low = "red", high = "green") +
  ggtitle("Correlations between Boston House features") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
plt

There are several columns that could be dropped due to their high correlation with other variables. Let’s keep track of these for a hot minute. We will drop these and remodel later to see if this helps any.
# Candidate columns to drop later because of their high pairwise correlation
EXCESSIVE_FEATURES <- c("DIS", "NOX", "TAX")
EXCESSIVE_FEATURES
[1] "DIS" "NOX" "TAX"
# Center and normalize
# Fit the scaling (per-column mean and sd) on the training predictors only
Xtrain <- train_df %>% select(-label)
Xs_train <- scale(Xtrain, center = TRUE, scale = TRUE)
train_df_scaled <- cbind(Xs_train, train_df %>% select(label))
# Apply the *training* centers and scales to the test set. Scaling the test
# set with its own statistics (the original T/T call) leaks test information
# and puts train and test features on slightly different scales.
Xtest <- test_df %>% select(-label)
Xs_test <- scale(Xtest,
                 center = attr(Xs_train, "scaled:center"),
                 scale = attr(Xs_train, "scaled:scale"))
test_df_scaled <- cbind(Xs_test, test_df %>% select(label))
Some cheat codes are required to make keras work with
dataframes. First, you have to make a multi-head model. Define an
input with your dimensionality of the data frame. Second,
create your other layers to pass it through. Merge them using the
keras_model function. R is great, but can be very painful
at times. This was not pleasant. Further, only feed in
matrices. Don’t even fight with throwing in a data frame. This
API isn’t smart like that.
# Input head
# Derive the input dimensionality from the data instead of hard-coding 13,
# so the model stays correct if the feature set changes.
n_features <- ncol(train_df %>% select(-label))
input <- keras::layer_input(shape = c(n_features))
# Output head. The batch-normalization layer standardizes the features
# inside the network, which is why the raw (unscaled) train_df can be fed
# directly below.
output <- input %>%
  keras::layer_batch_normalization() %>%
  keras::layer_dense(units = 1)
# Join input and output heads
model <- keras::keras_model(input, output)
model %>% keras::compile(
  loss = "mse",
  optimizer = "adam",
  metrics = list("mean_absolute_error")
)
# Keras only accepts matrices here, not data frames
history <- model %>% keras::fit(
  x = train_df %>% select(-label) %>% as.matrix,
  y = train_df$label %>% as.matrix,
  validation_split = 0.1,
  epochs = 100,
  verbose = 0
)
plot(history)

loss mean_absolute_error
29.055779 3.862492
# Predict on the held-out test set (keras wants a matrix, not a data frame)
ypred <- predict(model,
                 x = test_df %>% select(-label) %>% as.matrix)
tmp <- data.frame(id = seq_along(ypred),
                  prediction = as.vector(ypred),
                  target = test_df$label)
# NOTE: the `mse` column is really the per-observation *absolute* error —
# sqrt((p - t)^2) is just |p - t|. The column name is kept so the histogram
# below is unchanged. mutate() can reference columns directly; tmp$ is not
# needed (and is fragile) inside mutate.
tmp <- tmp %>% dplyr::mutate(mse = abs(prediction - target),
                             error = prediction - target)
plt <- ggplot(data = tmp) +
  geom_point(aes(x = id, y = prediction, color = "prediction")) +
  geom_point(aes(x = id, y = target, color = "target")) +
  theme_classic()
plt  # display the prediction-vs-target scatter (was built but never shown)
ggplot(tmp) +
  geom_histogram(aes(x = mse, fill = "mse"), alpha = 0.4) +
  geom_histogram(aes(x = error, fill = "error"), alpha = 0.4) +
  labs(fill = "Error Distribution") +
  ggtitle("Test Error") +
  xlab("Error") +
  ylab("Occurrences") +
  theme_light()
shapiro.test(tmp$error) # This is probably normal. We can do inferential statistics on the population!
Shapiro-Wilk normality test
data: tmp$error
W = 0.92301, p-value = 1.706e-05
To save you time and effort: the experiment of dropping the correlated EXCESSIVE_FEATURES variables actually made the model worse. This is probably because those features still carry valuable information in this relatively small data set.
Since we built our regression model with just one layer, the weights in that layer are in fact our parameters
# The dense layer is the third layer in the model: input, batch-norm, dense
dense_layer <- model$layers[[3]]
dense_weights <- dense_layer$get_weights()
theta <- dense_weights[[1]]  # kernel: one coefficient per feature
bias <- dense_weights[[2]]   # intercept term
theta
[,1]
[1,] -1.101465
[2,] -1.401569
[3,] 1.574454
[4,] -1.463762
[5,] 1.619609
[6,] 1.167189
[7,] -1.666827
[8,] 1.558088
[9,] -1.257356
[10,] -1.094540
[11,] 1.392148
[12,] 1.659781
[13,] -1.575807
# Pair each learned coefficient with its feature name; the bias is the intercept
feature_names <- names(train_df %>% select(-label))
est_df <- data.frame(variable = c(feature_names, "intercept"),
                     estimate = c(theta, bias))
ggplot(data = est_df) +
  geom_col(aes(x = variable, y = estimate, fill = variable)) +
  coord_flip() +
  labs(fill = "Variable Estimate") +
  ggtitle("Boston Housing Factors") +
  ylab("Coefficient Estimate") +
  xlab("Variable") +
  theme_bw()

https://tensorflow.rstudio.com/tutorials/beginners/basic-ml/tutorial_basic_regression/