SLGP with integer outputs

Athénaïs Gautier

This vignette serves as a quick guide to Spatial Logistic Gaussian Process (SLGP) modeling, with a focus on predicting distributions for integer outputs.

Dataset

We illustrate the model’s capabilities using the Boston Housing dataset @harrison_hedonic_1978, a widely used benchmark in statistical modeling and regression analysis.

In this vignette, we demonstrate how to model the distribution of rad (index of accessibility to radial highways, between 1 and 24 where 24 indicates the best accessibility and 1 the worst) as a function of medv (median value of owner-occupied homes) and dis (weighted distance to employment centers) using Spatial Logistic Gaussian Processes (SLGPs). Since rad is an integer-valued variable, our goal is to adjust the implementation for this.

library(dplyr)
# Load the dataset (available in MASS package)
require(MASS)
data("Boston", package = "MASS")
df <- Boston 
range_response <- range(df$rad)
range_x <- matrix(c(c(1, 13),
                    range(df$medv)), # Use c(1, 13) instead of range = c(1.1296 12.1265)
                  # For easier chunks
                  ncol=2, byrow=TRUE)

We represent the data.

library(ggplot2)
library(ggpubr)

# Histogram: Distribution of med by age bin
ggplot(df, aes(x = dis, y=medv, fill=as.factor(rad))) +
  geom_point(pch=21)+
  labs(x = "Median value of owner-occupied homes [k$]", 
       y = "weighted distance to employment centers", 
       title = "Scatter-plot of accessibility to radial highways") +
  theme_minimal()+
  scale_fill_viridis_d(option = "viridis",
                       guide = guide_legend(nrow = 3,
                                            title = "Index of accessibility",
                                            barheight = unit(2, units = "mm"),
                                            barwidth = unit(55, units = "mm"),
                                            title.position = 'top',
                                            label.position = "bottom",
                                            title.hjust = 0.5))+
  theme(legend.position = "bottom")
Figure 1: Index of Accessibility to Radial Highways as a Function of Home Value and Distance to Employment Centers

Figure 1: Index of Accessibility to Radial Highways as a Function of Home Value and Distance to Employment Centers

We propose mapping the value 24 to 9 for rad, resulting in a more compact domain

df$rad <- ifelse(df$rad==24, 9, df$rad) 
range_response <- range(df$rad)

# Histogram: Distribution of med by age bin
ggplot(df, aes(x = dis, y=medv, fill=factor(rad, levels=seq(9)))) +
  geom_point(pch=21)+
  labs(x = "Median value of owner-occupied homes [k$]", 
       y = "weighted distance to employment centers", 
       title = "Scatter-plot of accessibility to radial highways") +
  theme_minimal()+
  scale_fill_viridis_d(option = "viridis",
                       guide = guide_legend(nrow = 3,
                                            title = "Index of accessibility",
                                            barheight = unit(2, units = "mm"),
                                            barwidth = unit(55, units = "mm"),
                                            title.position = 'top',
                                            label.position = "bottom",
                                            title.hjust = 0.5))+
  theme(legend.position = "bottom")
Figure 2: Index of Accessibility to Radial Highways as a Function of Home Value and Distance to Employment Centers, with rescaled indices

Figure 2: Index of Accessibility to Radial Highways as a Function of Home Value and Distance to Employment Centers, with rescaled indices

We can also display the empirical probabilities. For that, we use “bins” for the samples, as there are no replicates.


# Create interval variables for medv and dis
df <- df %>%
  mutate(medv_bin = factor(paste0("medv in ",
                                  cut(medv, breaks = seq(0, 50, by = 10), 
                                      include.lowest = FALSE, right = TRUE)),
                           levels = paste0("medv in ", 
                                           cut(seq(50, 1, -10), 
                                               breaks = seq(0, 50, by = 10),
                                               include.lowest = FALSE, right = TRUE))),
         dis_bin = factor(paste0("dis in ",
                                 cut(dis, breaks = seq(1, 13, by = 2), 
                                     include.lowest = FALSE, right = TRUE)),
                          levels = paste0("dis in ", 
                                          cut(seq(2, 13, 2), 
                                              breaks = seq(1, 13, by = 2),
                                              include.lowest = FALSE, right = TRUE))))

ggplot(df, aes(x = rad, y = after_stat(prop))) +
  geom_bar(fill="cornflowerblue", col="navy", lwd=0.2, alpha=0.7, 
           width=0.4)+
  facet_grid(medv_bin ~ dis_bin) +
  labs(x = "Index of accessibility to radial highways (RAD)", 
       y = "Proportion", 
       title = "Distribution of RAD across bins of various home value and distance to employment centers") +
  theme_minimal() +
  theme(legend.position = "bottom")+
  scale_x_continuous(breaks = c(1:9)) 
Figure 3: Distribution of RAD across bins of various home value and distance to employment centers

Figure 3: Distribution of RAD across bins of various home value and distance to employment centers

SLGP model specifications

Maximum a posteriori estimate

library(SLGP)

modelMAP <- slgp(rad~dis+medv, # Use a formula with two indexing variables
                 data=df,
                 method="MAP", #Maximum a posteriori estimation scheme
                 basisFunctionsUsed = "RFF",
                 interpolateBasisFun="WNN", # Accelerate inference
                 hyperparams = list(lengthscale=c(0.1, 0.15, 0.15), 
                                    # Applied to normalised data
                                    # So 0.15 is 15% of the range of values
                                    sigma2=1), 
                 nIntegral = 9, #or length(seq(seq(range_response[1], range_response[2])))
                 sigmaEstimationMethod = "heuristic", # Set to heuristic for numerical stability                 
                 predictorsLower= c(range_x[,1]),
                 predictorsUpper= c(range_x[,2]),
                 responseRange= range_response,
                 opts_BasisFun = list(nFreq=150,
                                      MatParam=5/2))

We can represent the conditional probabilities. For that, we use “bins” for the samples, as there are no replicates. We display the histograms of values in these bins compared to SLGP predictions of the probabilities at the center of the bins.

library(viridis)
dfGrid <- data.frame(expand.grid(seq(range_x[1, 1], range_x[1, 2], 0.5), 
                                 seq(range_x[2, 1], range_x[2, 2], 1), 
                                 seq(range_response[1], range_response[2])))
colnames(dfGrid) <- c("dis", "medv", "rad")
pred <- predictSLGP_newNode(SLGPmodel=modelMAP,
                            newNodes = dfGrid,
                            nIntegral = 9)
pred[, -c(1:3)] <- pred[, -c(1:3)] * diff(range_response) /(diff(range_response) +1) 
# Goes from values to integrate over a domain to discrete probabilities
df_plot <- pred %>%
  filter(dis %in% seq(2, 12, 2))%>%
  filter(medv %in% seq(5, 45, 10)) %>%
  mutate(medv_bin=paste0("medv in (", medv-5,",", medv+5,"]"))%>%
  mutate(dis_bin=paste0("dis in (", dis-1,",", dis+1,"]"))
df_counts <- df %>%
  group_by(medv_bin, dis_bin) %>%
  summarise(count = paste0("n=", n()), .groups = "keep")

ggplot() +
  geom_bar(data=df, mapping=aes(x = rad-0.1, y = after_stat(prop)),
           fill="cornflowerblue", col="navy", lwd=0.2, alpha=0.7, 
           width=0.4) +
  geom_col(data=df_plot, mapping=aes(x = rad+0.1, y = pdf_1),
           col="red", fill="grey", lwd=0.2, alpha=0.7, lty=2, 
           width=0.4) +
  geom_text(data=df_counts, mapping=aes(x = 1.5, y = 1, label=count), 
            col="grey10", size = 3, vjust = 1)+ 
  facet_grid(medv_bin ~ dis_bin) +
  labs(x = "Index of accessibility to radial highways (RAD)", 
       y = "Proportion", 
       title = "Distribution of RAD across bins of various medv and dis values (blue histogram)\nVS SLGP at the center of these bins") +
  theme_minimal() +
  theme(legend.position = "bottom")+
  scale_x_continuous(breaks = c(1:9)) 
Figure 2: Predictive probabilities of rad at medv and dis, as predicted by a SLGP.

Figure 2: Predictive probabilities of rad at medv and dis, as predicted by a SLGP.