Bayesian Surprise with Canadian Census Data (cancensus)

Introduction

This vignette demonstrates how to use the bayesiansurpriser package with Canadian Census data accessed through the cancensus package. The cancensus package provides access to Canadian Census data from 1996 through 2021 via the CensusMapper API.

Canadian Census data is a useful setting for Bayesian Surprise analysis because it provides:

As with ACS examples, Census profile counts are best treated carefully. The surprise maps below model rates directly using distributional models. Funnel plots are used separately as diagnostics for how population or household counts affect the reliability of rates.

Prerequisites

You’ll need a CensusMapper API key from https://censusmapper.ca/users/sign_up

library(bayesiansurpriser)
library(cancensus)
library(sf)
library(ggplot2)
library(dplyr)

# Set your CensusMapper API key (do this once)
# set_cancensus_api_key("YOUR_KEY_HERE", install = TRUE)

# Optionally set a cache path for faster subsequent queries
# set_cancensus_cache_path("path/to/cache", install = TRUE)

Understanding cancensus Data Structure

Before diving into analysis, let’s understand the available datasets and variables.

# List available Census datasets
datasets <- list_census_datasets()
print(datasets %>% select(dataset, description))
# List regions for the 2021 Census
regions_2021 <- list_census_regions("CA21")
head(regions_2021 %>% filter(level == "CMA"))

Example 1: Low Income Rates by Census Division

Let’s examine low income rates across Canada’s Census Divisions and identify which regions have atypical rates relative to the cross-division distribution.

Fetch Census Data

# Find relevant vectors for low income
# v_CA21_1085: Total - Low-income status in 2020
# v_CA21_1086: 0 to 17 years in low income
# v_CA21_1090: In low income

# Get low income data at the Census Division level for all of Canada
# Using labels = "short" for cleaner column names
income_data <- get_census(
  dataset = "CA21",
  regions = list(C = "01"),  # All of Canada
  vectors = c(
    total_pop = "v_CA21_1085",  # Total population for low income status
    low_income = "v_CA21_1090"   # Population in low income
  ),
  level = "CD",
  geo_format = "sf",
  labels = "short",
  quiet = TRUE
)

# Clean and filter
income_data <- income_data %>%
  filter(!is.na(total_pop) & !is.na(low_income)) %>%
  filter(total_pop > 0) %>%
  mutate(
    low_income_rate = low_income / total_pop
  )

head(income_data %>% st_drop_geometry() %>% select(GeoUID, `Region Name`, total_pop, low_income, low_income_rate))

Compute Bayesian Surprise

# Compute surprise on the low-income-rate distribution.
income_surprise <- surprise(
  income_data,
  observed = low_income_rate,
  models = c("uniform", "gaussian", "sampled")
)

print(income_surprise)
summary(income_surprise)

Visualize Surprising Regions

# Add surprise values to data
income_data$surprise <- get_surprise(income_surprise, "surprise")
income_data$signed_surprise <- get_surprise(income_surprise, "signed")

# Transform to Statistics Canada Lambert projection for better Canada visualization
income_data_lambert <- st_transform(income_data, "EPSG:3347")

# Plot Canada-wide
ggplot(income_data_lambert) +
  geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.1) +
  scale_fill_surprise_diverging(name = "Signed\nsurprise") +
  labs(
    title = "Atypical Low Income Rates by Census Division",
    subtitle = "Signed surprise from the cross-division low-income-rate distribution",
    caption = "Source: Statistics Canada, 2021 Census"
  ) +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )

Identify Most Surprising Regions

# Top regions with positive signed surprise
high_income_surprise <- income_data %>%
  st_drop_geometry() %>%
  filter(signed_surprise > 0) %>%
  arrange(desc(surprise)) %>%
  select(`Region Name`, low_income_rate, total_pop, surprise) %>%
  head(10)

cat("Census Divisions with positive signed surprise:\n")
print(high_income_surprise)

# Top regions with negative signed surprise
low_income_surprise <- income_data %>%
  st_drop_geometry() %>%
  filter(signed_surprise < 0) %>%
  arrange(desc(surprise)) %>%
  select(`Region Name`, low_income_rate, total_pop, surprise) %>%
  head(10)

cat("\nCensus Divisions with negative signed surprise:\n")
print(low_income_surprise)

Example 2: Housing Analysis in Metro Vancouver

Let’s analyze owner-occupied housing rates at the Census Tract level within the Vancouver CMA.

# Get housing tenure data for Vancouver CMA
# v_CA21_4237: Total private households by tenure
# v_CA21_4238: Owner households

housing_data <- get_census(
  dataset = "CA21",
  regions = list(CMA = "59933"),  # Vancouver CMA
  vectors = c(
    total_hh = "v_CA21_4237",  # Total households
    owner_hh = "v_CA21_4238"   # Owner households
  ),
  level = "CT",
  geo_format = "sf",
  labels = "short",
  quiet = TRUE
)

housing_data <- housing_data %>%
  filter(!is.na(total_hh) & !is.na(owner_hh)) %>%
  filter(total_hh > 50) %>%  # Filter small tracts
  mutate(
    owner_rate = owner_hh / total_hh
  )

cat("Number of Census Tracts:", nrow(housing_data), "\n")
# Compute surprise on the home-ownership-rate distribution.
housing_surprise <- surprise(
  housing_data,
  observed = owner_rate,
  models = c("uniform", "gaussian", "sampled")
)

housing_data$signed_surprise <- get_surprise(housing_surprise, "signed")

# Plot Vancouver
ggplot(housing_data) +
  geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) +
  scale_fill_surprise_diverging(name = "Signed\nsurprise") +
  labs(
    title = "Atypical Home Ownership Rates in Metro Vancouver",
    subtitle = "Signed surprise from the tract-level owner-rate distribution",
    caption = "Source: Statistics Canada, 2021 Census"
  ) +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 9)
  )

Example 3: Language at Home Analysis

Identify areas with high model-conditional surprise for non-official language speakers at home.

# Get language data for Toronto CMA
# v_CA21_1144: Total - Knowledge of official languages
# v_CA21_1153: Neither English nor French

language_data <- get_census(
  dataset = "CA21",
  regions = list(CMA = "35535"),  # Toronto CMA
  vectors = c(
    total_pop = "v_CA21_1144",  # Total population
    no_official = "v_CA21_1153"   # Neither official language
  ),
  level = "CT",
  geo_format = "sf",
  labels = "short",
  quiet = TRUE
)

language_data <- language_data %>%
  filter(!is.na(total_pop) & !is.na(no_official)) %>%
  filter(total_pop > 100) %>%
  mutate(no_official_rate = no_official / total_pop)
# Compute surprise on the non-official-language-rate distribution.
lang_surprise <- surprise(
  language_data,
  observed = no_official_rate,
  models = c("uniform", "gaussian", "sampled")
)

language_data$signed_surprise <- get_surprise(lang_surprise, "signed")

# Plot Toronto
ggplot(language_data) +
  geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) +
  scale_fill_surprise_diverging(name = "Signed\nsurprise") +
  labs(
    title = "Atypical Non-Official Language Concentrations in Toronto",
    subtitle = "Signed surprise from the tract-level non-official-language-rate distribution",
    caption = "Source: Statistics Canada, 2021 Census"
  ) +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 9)
  )

Example 4: Provincial Comparison with Funnel Plot

Compare provinces using a funnel plot to account for population size variation.

# Get provincial data
provincial_data <- get_census(
  dataset = "CA21",
  regions = list(C = "01"),
  vectors = c(
    total_pop = "v_CA21_1085",  # Total for low income
    low_income = "v_CA21_1090"   # In low income
  ),
  level = "PR",
  geo_format = NA,
  labels = "short",
  quiet = TRUE
)

provincial_data <- provincial_data %>%
  filter(!is.na(total_pop) & total_pop > 0)

# Compute funnel data for points
funnel_df <- compute_funnel_data(
  observed = provincial_data$low_income,
  sample_size = provincial_data$total_pop,
  type = "count",
  limits = c(2, 3)
)
funnel_df$province <- provincial_data$`Region Name`
funnel_df$rate <- funnel_df$observed / funnel_df$sample_size

# National rate
national_rate <- sum(provincial_data$low_income) / sum(provincial_data$total_pop)

# Create continuous funnel bands
n_range <- range(provincial_data$total_pop)
n_seq <- exp(seq(log(n_range[1] * 0.8), log(n_range[2] * 1.2), length.out = 200))
se_seq <- sqrt(national_rate * (1 - national_rate) / n_seq)

funnel_bands <- data.frame(
  n = n_seq,
  rate = national_rate,
  lower_2sd = national_rate - 2 * se_seq,
  upper_2sd = national_rate + 2 * se_seq,
  lower_3sd = national_rate - 3 * se_seq,
  upper_3sd = national_rate + 3 * se_seq
)

# Create funnel plot
ggplot() +
  # 3 SD band (outer)
  geom_ribbon(
    data = funnel_bands,
    aes(x = n, ymin = lower_3sd, ymax = upper_3sd),
    fill = "#9ecae1", alpha = 0.5
  ) +
  # 2 SD band (inner)
  geom_ribbon(
    data = funnel_bands,
    aes(x = n, ymin = lower_2sd, ymax = upper_2sd),
    fill = "#3182bd", alpha = 0.5
  ) +
  # National average line
  geom_hline(
    yintercept = national_rate,
    linetype = "dashed", color = "#636363", linewidth = 0.8
  ) +
  # Points
  geom_point(
    data = funnel_df,
    aes(x = sample_size, y = rate, color = abs(z_score) > 2),
    size = 4
  ) +
  # Labels
  ggrepel::geom_text_repel(
    data = funnel_df,
    aes(x = sample_size, y = rate, label = province),
    size = 3, fontface = "bold",
    max.overlaps = 15,
    segment.color = "gray60"
  ) +
  scale_color_manual(
    values = c("FALSE" = "#636363", "TRUE" = "#e6550d"),
    guide = "none"
  ) +
  scale_x_log10(
    labels = scales::comma,
    breaks = c(1e4, 1e5, 1e6, 1e7)
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Funnel Plot: Provincial Low Income Rates",
    subtitle = "Provinces outside bands have surprising rates given population size",
    x = "Population (log scale)",
    y = "Low Income Rate",
    caption = "Source: Statistics Canada, 2021 Census"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

Example 5: Comparing Census Years

Compare surprise patterns between 2016 and 2021 Census at the provincial level.

# Get provincial data for 2016 and 2021
# Note: Variable IDs differ between Census years

# 2021 data
prov_2021 <- get_census(
  dataset = "CA21",
  regions = list(C = "01"),
  vectors = c(total_pop = "v_CA21_1085", low_income = "v_CA21_1090"),
  level = "PR",
  geo_format = NA,
  labels = "short",
  quiet = TRUE
)
prov_2021 <- prov_2021[!is.na(prov_2021$total_pop) & prov_2021$total_pop > 0, ]
prov_2021$low_income_rate <- prov_2021$low_income / prov_2021$total_pop
prov_2021$census_year <- 2021

# 2016 data (different vector IDs for same concept)
prov_2016 <- get_census(
  dataset = "CA16",
  regions = list(C = "01"),
  vectors = c(total_pop = "v_CA16_2540", low_income = "v_CA16_2555"),
  level = "PR",
  geo_format = NA,
  labels = "short",
  quiet = TRUE
)
prov_2016 <- prov_2016[!is.na(prov_2016$total_pop) & prov_2016$total_pop > 0, ]
prov_2016$low_income_rate <- prov_2016$low_income / prov_2016$total_pop
prov_2016$census_year <- 2016

# Compute surprise for each year's provincial low-income-rate distribution
surprise_2021 <- surprise(
  prov_2021,
  observed = low_income_rate,
  models = c("uniform", "gaussian", "sampled")
)
prov_2021$signed_surprise <- get_surprise(surprise_2021, "signed")

surprise_2016 <- surprise(
  prov_2016,
  observed = low_income_rate,
  models = c("uniform", "gaussian", "sampled")
)
prov_2016$signed_surprise <- get_surprise(surprise_2016, "signed")

# Compare changes
comparison <- merge(
  prov_2016[, c("GeoUID", "Region Name", "signed_surprise")],
  prov_2021[, c("GeoUID", "signed_surprise")],
  by = "GeoUID",
  suffixes = c("_2016", "_2021")
)
comparison$change <- comparison$signed_surprise_2021 - comparison$signed_surprise_2016
print(comparison[order(abs(comparison$change), decreasing = TRUE), ])

Note: This example is not evaluated because it requires multiple API calls. Run locally with your CensusMapper API key.


Example 6: Custom Model Space for Census Data

For Census data, create a custom model space that matches the quantity being modeled. Here the observed values are home-ownership rates, so the models are rate-distribution models rather than population-baseline count models.

# Using the Vancouver housing data
# Create a custom model space

# Create custom model space for rates
housing_models <- model_space(
  bs_model_gaussian(),
  bs_model_sampled()
)

print(housing_models)

# Use surprise() with custom model space
custom_result <- surprise(
  housing_data,
  observed = owner_rate,
  models = housing_models
)

housing_data$custom_surprise <- get_surprise(custom_result, "signed")

# Summary statistics
cat("Custom model surprise summary:\n")
summary(housing_data$custom_surprise)

Interpreting Surprise Values

Understanding what surprise values mean is crucial for proper interpretation:

Surprise Magnitude Interpretation Meaning
< 0.1 Trivial Little update from prior to posterior model probabilities
0.1 - 0.3 Minor Small model update
0.3 - 0.5 Moderate Noticeable model update
0.5 - 1.0 Substantial Strong model update
> 1.0 High Very strong model update under the chosen model space

Key insight: Surprise measures how much the data discriminates between models, not just how far a rate is from average. A region with an unusual rate may show low surprise if all models assign similar likelihood to it. A region with a rate near the average can still show surprise if it falls in a part of the distribution where the chosen models disagree. Use the funnel plot separately to understand whether population size makes a rate estimate unusually reliable or noisy.

# Categorize regions by surprise level
income_data <- income_data %>%
  mutate(
    surprise_category = cut(
      abs(signed_surprise),
      breaks = c(0, 0.1, 0.3, 0.5, 1.0, Inf),
      labels = c("Trivial", "Minor", "Moderate", "Substantial", "High"),
      include.lowest = TRUE
    )
  )

# Summary by category
cat("Census Divisions by surprise level:\n")
print(table(income_data$surprise_category))

Session Info

sessionInfo()
#> R version 4.5.0 (2025-04-11)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.4.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
#> 
#> locale:
#> [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
#> 
#> time zone: America/Los_Angeles
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.1.4             ggplot2_4.0.1           sf_1.0-23              
#> [4] cancensus_0.5.7         bayesiansurpriser_0.1.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.0     Rcpp_1.1.0        
#>  [5] tidyselect_1.2.1   jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10       
#>  [9] fastmap_1.2.0      R6_2.6.1           generics_0.1.4     classInt_0.4-11   
#> [13] knitr_1.50         MASS_7.3-65        tibble_3.3.1       units_1.0-0       
#> [17] DBI_1.2.3          bslib_0.9.0        pillar_1.11.1      RColorBrewer_1.1-3
#> [21] rlang_1.1.7        cachem_1.1.0       xfun_0.52          sass_0.4.10       
#> [25] S7_0.2.1           cli_3.6.5          withr_3.0.2        magrittr_2.0.4    
#> [29] class_7.3-23       digest_0.6.39      grid_4.5.0         lifecycle_1.0.5   
#> [33] vctrs_0.7.0        KernSmooth_2.23-26 proxy_0.4-28       evaluate_1.0.4    
#> [37] glue_1.8.0         farver_2.1.2       e1071_1.7-17       rmarkdown_2.29    
#> [41] tools_4.5.0        pkgconfig_2.0.3    htmltools_0.5.8.1