This vignette demonstrates how to use the
bayesiansurpriser package with Canadian Census data
accessed through the cancensus package. The cancensus
package provides access to Canadian Census data from 1996 through 2021
via the CensusMapper API.
Canadian Census data is a useful setting for Bayesian Surprise analysis because it provides:
As with ACS examples, Census profile counts are best treated carefully. The surprise maps below model rates directly using distributional models. Funnel plots are used separately as diagnostics for how population or household counts affect the reliability of rates.
You’ll need a CensusMapper API key from https://censusmapper.ca/users/sign_up
library(bayesiansurpriser)
library(cancensus)
library(sf)
library(ggplot2)
library(dplyr)
# Set your CensusMapper API key (do this once)
# set_cancensus_api_key("YOUR_KEY_HERE", install = TRUE)
# Optionally set a cache path for faster subsequent queries
# set_cancensus_cache_path("path/to/cache", install = TRUE)Before diving into analysis, let’s understand the available datasets and variables.
# List available Census datasets
datasets <- list_census_datasets()
print(datasets %>% select(dataset, description))# List regions for the 2021 Census
regions_2021 <- list_census_regions("CA21")
head(regions_2021 %>% filter(level == "CMA"))Let’s examine low income rates across Canada’s Census Divisions and identify which regions have atypical rates relative to the cross-division distribution.
# Find relevant vectors for low income
# v_CA21_1085: Total - Low-income status in 2020
# v_CA21_1086: 0 to 17 years in low income
# v_CA21_1090: In low income
# Get low income data at the Census Division level for all of Canada
# Using labels = "short" for cleaner column names
income_data <- get_census(
dataset = "CA21",
regions = list(C = "01"), # All of Canada
vectors = c(
total_pop = "v_CA21_1085", # Total population for low income status
low_income = "v_CA21_1090" # Population in low income
),
level = "CD",
geo_format = "sf",
labels = "short",
quiet = TRUE
)
# Clean and filter
income_data <- income_data %>%
filter(!is.na(total_pop) & !is.na(low_income)) %>%
filter(total_pop > 0) %>%
mutate(
low_income_rate = low_income / total_pop
)
head(income_data %>% st_drop_geometry() %>% select(GeoUID, `Region Name`, total_pop, low_income, low_income_rate))# Add surprise values to data
income_data$surprise <- get_surprise(income_surprise, "surprise")
income_data$signed_surprise <- get_surprise(income_surprise, "signed")
# Transform to Statistics Canada Lambert projection for better Canada visualization
income_data_lambert <- st_transform(income_data, "EPSG:3347")
# Plot Canada-wide
ggplot(income_data_lambert) +
geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.1) +
scale_fill_surprise_diverging(name = "Signed\nsurprise") +
labs(
title = "Atypical Low Income Rates by Census Division",
subtitle = "Signed surprise from the cross-division low-income-rate distribution",
caption = "Source: Statistics Canada, 2021 Census"
) +
theme_void() +
theme(
legend.position = "right",
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5)
)# Top regions with positive signed surprise
high_income_surprise <- income_data %>%
st_drop_geometry() %>%
filter(signed_surprise > 0) %>%
arrange(desc(surprise)) %>%
select(`Region Name`, low_income_rate, total_pop, surprise) %>%
head(10)
cat("Census Divisions with positive signed surprise:\n")
print(high_income_surprise)
# Top regions with negative signed surprise
low_income_surprise <- income_data %>%
st_drop_geometry() %>%
filter(signed_surprise < 0) %>%
arrange(desc(surprise)) %>%
select(`Region Name`, low_income_rate, total_pop, surprise) %>%
head(10)
cat("\nCensus Divisions with negative signed surprise:\n")
print(low_income_surprise)Let’s analyze owner-occupied housing rates at the Census Tract level within the Vancouver CMA.
# Get housing tenure data for Vancouver CMA
# v_CA21_4237: Total private households by tenure
# v_CA21_4238: Owner households
housing_data <- get_census(
dataset = "CA21",
regions = list(CMA = "59933"), # Vancouver CMA
vectors = c(
total_hh = "v_CA21_4237", # Total households
owner_hh = "v_CA21_4238" # Owner households
),
level = "CT",
geo_format = "sf",
labels = "short",
quiet = TRUE
)
housing_data <- housing_data %>%
filter(!is.na(total_hh) & !is.na(owner_hh)) %>%
filter(total_hh > 50) %>% # Filter small tracts
mutate(
owner_rate = owner_hh / total_hh
)
cat("Number of Census Tracts:", nrow(housing_data), "\n")# Compute surprise on the home-ownership-rate distribution.
housing_surprise <- surprise(
housing_data,
observed = owner_rate,
models = c("uniform", "gaussian", "sampled")
)
housing_data$signed_surprise <- get_surprise(housing_surprise, "signed")
# Plot Vancouver
ggplot(housing_data) +
geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) +
scale_fill_surprise_diverging(name = "Signed\nsurprise") +
labs(
title = "Atypical Home Ownership Rates in Metro Vancouver",
subtitle = "Signed surprise from the tract-level owner-rate distribution",
caption = "Source: Statistics Canada, 2021 Census"
) +
theme_void() +
theme(
legend.position = "right",
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 9)
)Identify areas with high model-conditional surprise for non-official language speakers at home.
# Get language data for Toronto CMA
# v_CA21_1144: Total - Knowledge of official languages
# v_CA21_1153: Neither English nor French
language_data <- get_census(
dataset = "CA21",
regions = list(CMA = "35535"), # Toronto CMA
vectors = c(
total_pop = "v_CA21_1144", # Total population
no_official = "v_CA21_1153" # Neither official language
),
level = "CT",
geo_format = "sf",
labels = "short",
quiet = TRUE
)
language_data <- language_data %>%
filter(!is.na(total_pop) & !is.na(no_official)) %>%
filter(total_pop > 100) %>%
mutate(no_official_rate = no_official / total_pop)# Compute surprise on the non-official-language-rate distribution.
lang_surprise <- surprise(
language_data,
observed = no_official_rate,
models = c("uniform", "gaussian", "sampled")
)
language_data$signed_surprise <- get_surprise(lang_surprise, "signed")
# Plot Toronto
ggplot(language_data) +
geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) +
scale_fill_surprise_diverging(name = "Signed\nsurprise") +
labs(
title = "Atypical Non-Official Language Concentrations in Toronto",
subtitle = "Signed surprise from the tract-level non-official-language-rate distribution",
caption = "Source: Statistics Canada, 2021 Census"
) +
theme_void() +
theme(
legend.position = "right",
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 9)
)Compare provinces using a funnel plot to account for population size variation.
# Get provincial data
provincial_data <- get_census(
dataset = "CA21",
regions = list(C = "01"),
vectors = c(
total_pop = "v_CA21_1085", # Total for low income
low_income = "v_CA21_1090" # In low income
),
level = "PR",
geo_format = NA,
labels = "short",
quiet = TRUE
)
provincial_data <- provincial_data %>%
filter(!is.na(total_pop) & total_pop > 0)
# Compute funnel data for points
funnel_df <- compute_funnel_data(
observed = provincial_data$low_income,
sample_size = provincial_data$total_pop,
type = "count",
limits = c(2, 3)
)
funnel_df$province <- provincial_data$`Region Name`
funnel_df$rate <- funnel_df$observed / funnel_df$sample_size
# National rate
national_rate <- sum(provincial_data$low_income) / sum(provincial_data$total_pop)
# Create continuous funnel bands
n_range <- range(provincial_data$total_pop)
n_seq <- exp(seq(log(n_range[1] * 0.8), log(n_range[2] * 1.2), length.out = 200))
se_seq <- sqrt(national_rate * (1 - national_rate) / n_seq)
funnel_bands <- data.frame(
n = n_seq,
rate = national_rate,
lower_2sd = national_rate - 2 * se_seq,
upper_2sd = national_rate + 2 * se_seq,
lower_3sd = national_rate - 3 * se_seq,
upper_3sd = national_rate + 3 * se_seq
)
# Create funnel plot
ggplot() +
# 3 SD band (outer)
geom_ribbon(
data = funnel_bands,
aes(x = n, ymin = lower_3sd, ymax = upper_3sd),
fill = "#9ecae1", alpha = 0.5
) +
# 2 SD band (inner)
geom_ribbon(
data = funnel_bands,
aes(x = n, ymin = lower_2sd, ymax = upper_2sd),
fill = "#3182bd", alpha = 0.5
) +
# National average line
geom_hline(
yintercept = national_rate,
linetype = "dashed", color = "#636363", linewidth = 0.8
) +
# Points
geom_point(
data = funnel_df,
aes(x = sample_size, y = rate, color = abs(z_score) > 2),
size = 4
) +
# Labels
ggrepel::geom_text_repel(
data = funnel_df,
aes(x = sample_size, y = rate, label = province),
size = 3, fontface = "bold",
max.overlaps = 15,
segment.color = "gray60"
) +
scale_color_manual(
values = c("FALSE" = "#636363", "TRUE" = "#e6550d"),
guide = "none"
) +
scale_x_log10(
labels = scales::comma,
breaks = c(1e4, 1e5, 1e6, 1e7)
) +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Funnel Plot: Provincial Low Income Rates",
subtitle = "Provinces outside bands have surprising rates given population size",
x = "Population (log scale)",
y = "Low Income Rate",
caption = "Source: Statistics Canada, 2021 Census"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)Compare surprise patterns between 2016 and 2021 Census at the provincial level.
# Get provincial data for 2016 and 2021
# Note: Variable IDs differ between Census years
# 2021 data
prov_2021 <- get_census(
dataset = "CA21",
regions = list(C = "01"),
vectors = c(total_pop = "v_CA21_1085", low_income = "v_CA21_1090"),
level = "PR",
geo_format = NA,
labels = "short",
quiet = TRUE
)
prov_2021 <- prov_2021[!is.na(prov_2021$total_pop) & prov_2021$total_pop > 0, ]
prov_2021$low_income_rate <- prov_2021$low_income / prov_2021$total_pop
prov_2021$census_year <- 2021
# 2016 data (different vector IDs for same concept)
prov_2016 <- get_census(
dataset = "CA16",
regions = list(C = "01"),
vectors = c(total_pop = "v_CA16_2540", low_income = "v_CA16_2555"),
level = "PR",
geo_format = NA,
labels = "short",
quiet = TRUE
)
prov_2016 <- prov_2016[!is.na(prov_2016$total_pop) & prov_2016$total_pop > 0, ]
prov_2016$low_income_rate <- prov_2016$low_income / prov_2016$total_pop
prov_2016$census_year <- 2016
# Compute surprise for each year's provincial low-income-rate distribution
surprise_2021 <- surprise(
prov_2021,
observed = low_income_rate,
models = c("uniform", "gaussian", "sampled")
)
prov_2021$signed_surprise <- get_surprise(surprise_2021, "signed")
surprise_2016 <- surprise(
prov_2016,
observed = low_income_rate,
models = c("uniform", "gaussian", "sampled")
)
prov_2016$signed_surprise <- get_surprise(surprise_2016, "signed")
# Compare changes
comparison <- merge(
prov_2016[, c("GeoUID", "Region Name", "signed_surprise")],
prov_2021[, c("GeoUID", "signed_surprise")],
by = "GeoUID",
suffixes = c("_2016", "_2021")
)
comparison$change <- comparison$signed_surprise_2021 - comparison$signed_surprise_2016
print(comparison[order(abs(comparison$change), decreasing = TRUE), ])Note: This example is not evaluated because it requires multiple API calls. Run locally with your CensusMapper API key.
For Census data, create a custom model space that matches the quantity being modeled. Here the observed values are home-ownership rates, so the models are rate-distribution models rather than population-baseline count models.
# Using the Vancouver housing data
# Create a custom model space
# Create custom model space for rates
housing_models <- model_space(
bs_model_gaussian(),
bs_model_sampled()
)
print(housing_models)
# Use surprise() with custom model space
custom_result <- surprise(
housing_data,
observed = owner_rate,
models = housing_models
)
housing_data$custom_surprise <- get_surprise(custom_result, "signed")
# Summary statistics
cat("Custom model surprise summary:\n")
summary(housing_data$custom_surprise)Understanding what surprise values mean is crucial for proper interpretation:
| Surprise Magnitude | Interpretation | Meaning |
|---|---|---|
| < 0.1 | Trivial | Little update from prior to posterior model probabilities |
| 0.1 - 0.3 | Minor | Small model update |
| 0.3 - 0.5 | Moderate | Noticeable model update |
| 0.5 - 1.0 | Substantial | Strong model update |
| > 1.0 | High | Very strong model update under the chosen model space |
Key insight: Surprise measures how much the data discriminates between models, not just how far a rate is from average. A region with an unusual rate may show low surprise if all models assign similar likelihood to it. A region with a rate near the average can still show surprise if it falls in a part of the distribution where the chosen models disagree. Use the funnel plot separately to understand whether population size makes a rate estimate unusually reliable or noisy.
# Categorize regions by surprise level
income_data <- income_data %>%
mutate(
surprise_category = cut(
abs(signed_surprise),
breaks = c(0, 0.1, 0.3, 0.5, 1.0, Inf),
labels = c("Trivial", "Minor", "Moderate", "Substantial", "High"),
include.lowest = TRUE
)
)
# Summary by category
cat("Census Divisions by surprise level:\n")
print(table(income_data$surprise_category))sessionInfo()
#> R version 4.5.0 (2025-04-11)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.4.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
#>
#> locale:
#> [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
#>
#> time zone: America/Los_Angeles
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] dplyr_1.1.4 ggplot2_4.0.1 sf_1.0-23
#> [4] cancensus_0.5.7 bayesiansurpriser_0.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.0 Rcpp_1.1.0
#> [5] tidyselect_1.2.1 jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10
#> [9] fastmap_1.2.0 R6_2.6.1 generics_0.1.4 classInt_0.4-11
#> [13] knitr_1.50 MASS_7.3-65 tibble_3.3.1 units_1.0-0
#> [17] DBI_1.2.3 bslib_0.9.0 pillar_1.11.1 RColorBrewer_1.1-3
#> [21] rlang_1.1.7 cachem_1.1.0 xfun_0.52 sass_0.4.10
#> [25] S7_0.2.1 cli_3.6.5 withr_3.0.2 magrittr_2.0.4
#> [29] class_7.3-23 digest_0.6.39 grid_4.5.0 lifecycle_1.0.5
#> [33] vctrs_0.7.0 KernSmooth_2.23-26 proxy_0.4-28 evaluate_1.0.4
#> [37] glue_1.8.0 farver_2.1.2 e1071_1.7-17 rmarkdown_2.29
#> [41] tools_4.5.0 pkgconfig_2.0.3 htmltools_0.5.8.1