This vignette demonstrates all major functions in the
bayesiansurpriser package with working examples and
visualizations.
`surprise()` is the primary function for computing Bayesian surprise on data frames and sf objects.
# Load the North Carolina SIDS demo data shipped with the sf package
# (100 counties as an sf MULTIPOLYGON data frame).
nc <- st_read(system.file("shape/nc.shp", package = "sf"), quiet = TRUE)
# Compute surprise with default models (uniform, baserate, funnel);
# SID74 = observed SIDS cases, BIR74 = births used as the expected/exposure.
result <- surprise(nc, observed = SID74, expected = BIR74)
# View structure -- per the printed output below, the result is the input sf
# object with `surprise` and `signed_surprise` columns appended.
print(result)
#> Bayesian Surprise Map
#> =====================
#> Models: 3
#> Surprise range: 0.0334 to 0.6008
#> Signed surprise range: -0.5834 to 0.6008
#>
#> Simple feature collection with 100 features and 16 fields
#> Geometry type: MULTIPOLYGON
#> Dimension: XY
#> Bounding box: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
#> Geodetic CRS: NAD27
#> First 10 features:
#> AREA PERIMETER CNTY_ CNTY_ID NAME FIPS FIPSNO CRESS_ID BIR74 SID74
#> 1 0.114 1.442 1825 1825 Ashe 37009 37009 5 1091 1
#> 2 0.061 1.231 1827 1827 Alleghany 37005 37005 3 487 0
#> 3 0.143 1.630 1828 1828 Surry 37171 37171 86 3188 5
#> 4 0.070 2.968 1831 1831 Currituck 37053 37053 27 508 1
#> 5 0.153 2.206 1832 1832 Northampton 37131 37131 66 1421 9
#> 6 0.097 1.670 1833 1833 Hertford 37091 37091 46 1452 7
#> 7 0.062 1.547 1834 1834 Camden 37029 37029 15 286 0
#> 8 0.091 1.284 1835 1835 Gates 37073 37073 37 420 0
#> 9 0.118 1.421 1836 1836 Warren 37185 37185 93 968 4
#> 10 0.124 1.428 1837 1837 Stokes 37169 37169 85 1612 1
#> NWBIR74 BIR79 SID79 NWBIR79 geometry surprise
#> 1 10 1364 0 19 MULTIPOLYGON (((-81.47276 3... 0.19528584
#> 2 10 542 3 12 MULTIPOLYGON (((-81.23989 3... 0.11076159
#> 3 208 3616 6 260 MULTIPOLYGON (((-80.45634 3... 0.29096840
#> 4 123 830 2 145 MULTIPOLYGON (((-76.00897 3... 0.11714867
#> 5 1066 1606 3 1197 MULTIPOLYGON (((-77.21767 3... 0.57773733
#> 6 954 1838 5 1237 MULTIPOLYGON (((-76.74506 3... 0.51437420
#> 7 115 350 2 139 MULTIPOLYGON (((-76.00897 3... 0.04355317
#> 8 254 594 2 371 MULTIPOLYGON (((-76.56251 3... 0.08634903
#> 9 748 1190 2 844 MULTIPOLYGON (((-78.30876 3... 0.37427103
#> 10 160 2038 5 176 MULTIPOLYGON (((-80.02567 3... 0.31685289
#> signed_surprise
#> 1 -0.19528584
#> 2 -0.11076159
#> 3 -0.29096840
#> 4 -0.11714867
#> 5 0.57773733
#> 6 0.51437420
#> 7 -0.04355317
#> 8 -0.08634903
#> 9 0.37427103
#> 10 -0.31685289
# Visualize the result
ggplot(result) +
geom_sf(aes(fill = signed_surprise)) +
scale_fill_surprise_diverging() +
labs(
title = "Bayesian Surprise: NC SIDS Cases (1974)",
subtitle = "Blue = fewer than expected, Red = more than expected",
fill = "Signed\nSurprise"
) +
theme_minimal()
For quick analysis without a data frame:
# Observed counts and expected values (plain numeric vectors -- no data
# frame needed for auto_surprise()).
observed <- c(50, 100, 150, 200, 75, 300, 25)
expected <- c(10000, 50000, 100000, 25000, 15000, 80000, 5000)
result_auto <- auto_surprise(observed, expected)
# View surprise values (unsigned magnitudes)
cat("Surprise values:\n")
#> Surprise values:
print(round(result_auto$surprise, 4))
#> [1] 0.5817 0.6163 0.6948 0.6256 0.5852 0.5850 0.5507
# Signed surprise: negative = fewer than expected (per the diverging-scale
# convention used elsewhere in this vignette).
cat("\nSigned surprise:\n")
#>
#> Signed surprise:
print(round(result_auto$signed_surprise, 4))
#> [1] 0.5817 -0.6163 -0.6948 0.6256 0.5852 0.5850 0.5507
# Visualize
# Assemble a plotting data frame: one row per region with both the
# unsigned and signed surprise from auto_surprise() above.
df <- data.frame(
region = LETTERS[1:7],
observed = observed,
expected = expected,
surprise = result_auto$surprise,
signed = result_auto$signed_surprise
)
ggplot(df, aes(x = reorder(region, -surprise), y = surprise, fill = signed)) +
geom_col() +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
labs(
title = "Surprise by Region",
x = "Region", y = "Surprise (bits)",
fill = "Direction"
) +
theme_minimal()
Direct computation with a model space:
# Build a model space -- the lower-level API used internally by surprise().
# The baserate and funnel models are parameterized by the births vector.
mspace <- model_space(
bs_model_uniform(),
bs_model_baserate(nc$BIR74),
bs_model_funnel(nc$BIR74)
)
# Compute surprise directly, requesting signed values as well
result_low <- compute_surprise(
model_space = mspace,
observed = nc$SID74,
expected = nc$BIR74,
return_signed = TRUE
)
# One surprise value per county (100, matching nrow(nc))
cat("Computed", length(result_low$surprise), "surprise values\n")
#> Computed 100 surprise values
cat("Range:", round(range(result_low$surprise), 4), "\n")
#> Range: 0.0334 0.6008
Assumes all observations equally likely:
Expects observations proportional to baseline:
# Fit from data (default): mu and sigma are left empty and estimated when
# the model sees data (fit_from_data = TRUE in the print-out below).
model_gaussian_fit <- bs_model_gaussian()
print(model_gaussian_fit)
#> <bs_model: gaussian >
#> Name: Gaussian
#> Parameters:
#> mu :
#> sigma :
#> fit_from_data : TRUE
# Fixed parameters: supply mu/sigma explicitly and disable fitting
model_gaussian_fixed <- bs_model_gaussian(mu = 100, sigma = 20, fit_from_data = FALSE)
print(model_gaussian_fixed)
#> <bs_model: gaussian >
#> Name: Gaussian
#> Parameters:
#> mu : 100
#> sigma : 20
#> fit_from_data : FALSE
Uses kernel density estimation:
Accounts for sampling variance:
# Define a two-component mixture: components N(50, 10) and N(150, 20)
# with weights 0.6 / 0.4 (weights should sum to 1).
model_mix <- bs_model_gaussian_mixture(
means = c(50, 150),
sds = c(10, 20),
weights = c(0.6, 0.4)
)
print(model_mix)
#> <bs_model: gaussian_mixture >
#> Name: Gaussian Mixture (k=2)
#> Parameters:
#> means : 50, 150
#> sds : 10, 20
#> weights : 0.6, 0.4
#> k : 2
# Add a model
# NOTE(review): `space` is a 3-model space built earlier in the vignette
# (its definition is not shown in this excerpt).
space_expanded <- add_model(space, bs_model_gaussian())
cat("After add_model:", n_models(space_expanded), "models\n")
#> After add_model: 4 models
cat("Model names:", model_names(space_expanded), "\n")
#> Model names: Uniform Base Rate de Moivre Funnel (paper) Gaussian
# Remove a model (by index) -- drops the Gaussian added above
space_reduced <- remove_model(space_expanded, 4)
cat("After remove_model:", n_models(space_reduced), "models\n")
#> After remove_model: 3 models
# Set new prior (one weight per model, in model order)
space_reweighted <- set_prior(space, c(0.1, 0.6, 0.3))
cat("New prior:", space_reweighted$prior, "\n")
#> New prior: 0.1 0.6 0.3
# Get model names
cat("Model names:", model_names(space), "\n")
#> Model names: Uniform Population Funnel
# Update model space with observed data
# Update the model-space weights against the observed counts; the prior on
# `space` is untouched -- the posterior lives on the returned object.
updated_space <- bayesian_update(space, nc$SID74)
cat("Prior: ", round(space$prior, 4), "\n")
#> Prior: 0.2 0.5 0.3
cat("Posterior:", round(updated_space$posterior, 4), "\n")
#> Posterior: 0.231 0.763 0.006
space_simple <- model_space(bs_model_uniform(), bs_model_gaussian())
# Sequential observations (processed one at a time); the 100 is a deliberate
# outlier relative to the 10-40 trend.
observations <- c(10, 15, 20, 25, 100, 30, 35, 40)
cum_result <- cumulative_bayesian_update(space_simple, observations)
cat("Observations processed:", length(cum_result$cumulative_surprise), "\n")
#> Observations processed: 8
cat("Final posterior:", round(cum_result$final_space$posterior, 4), "\n")
#> Final posterior: 1 0
result <- surprise(nc, observed = SID74, expected = BIR74)
# Get surprise values -- get_surprise() extracts a numeric vector from the
# result; "surprise" for magnitudes, "signed" for directional values.
surprise_vals <- get_surprise(result, "surprise")
signed_vals <- get_surprise(result, "signed")
cat("First 5 surprise values:", round(surprise_vals[1:5], 4), "\n")
#> First 5 surprise values: 0.1953 0.1108 0.291 0.1171 0.5777
cat("First 5 signed values:", round(signed_vals[1:5], 4), "\n")
#> First 5 signed values: -0.1953 -0.1108 -0.291 -0.1171 0.5777
# Get model space -- recovers the model space used for the computation
mspace <- get_model_space(result)
print(mspace)
#> <bs_model_space>
#> Models: 3
#> 1. Uniform (prior: 0.3333)
#> 2. Base Rate (prior: 0.3333)
#> 3. de Moivre Funnel (paper) (prior: 0.3333)
# Create panel data
# Seed for reproducibility of any randomized steps downstream
set.seed(42)
# Long-format panel: 4 regions observed over 5 years (20 rows)
panel_data <- data.frame(
year = rep(2018:2022, each = 4),
region = rep(c("North", "South", "East", "West"), 5),
cases = c(45, 80, 55, 70, # 2018
50, 85, 60, 65, # 2019
48, 90, 58, 72, # 2020
55, 95, 52, 68, # 2021
60, 100, 65, 75), # 2022
population = rep(c(10000, 20000, 15000, 18000), 5)
)
# Compute surprise separately for each time period
temporal_result <- surprise_temporal(
panel_data,
time_col = year,
observed = cases,
expected = population,
region_col = region
)
print(temporal_result)
#> <bs_surprise_temporal>
#> Time periods: 5
#> Time range: 2018 to 2022
#> Models: 3
# Initial computation
initial <- auto_surprise(c(50, 100, 75), c(10000, 20000, 15000))
cat("Initial surprise:", round(initial$surprise, 4), "\n")
#> Initial surprise: 0.4312 0.4656 0.4523
# Stream new observations -- update_surprise() appends values for the new
# observations rather than recomputing the first three (see output below).
updated <- update_surprise(initial,
new_observed = c(55, 110),
new_expected = c(11000, 22000))
cat("After update:", round(updated$surprise, 4), "\n")
#> After update: 0.4312 0.4656 0.4523 0.4364 0.4698
# Time series data
# Time series with an anomalous burst (120, 115, 125) mid-series
ts_data <- c(50, 52, 48, 55, 51, 120, 115, 125, 60, 58, 62, 55)
expected_ts <- rep(1000, length(ts_data))
# Sliding windows of 4 observations, advancing 1 step at a time
rolling_result <- surprise_rolling(
ts_data,
expected = expected_ts,
window_size = 4,
step = 1
)
# Extract mean surprise per window
window_means <- sapply(rolling_result$windows, function(w) mean(w$result$surprise))
plot(seq_along(window_means), window_means, type = "b",
xlab = "Window Position", ylab = "Mean Surprise",
main = "Rolling Window Surprise", pch = 19)
abline(h = mean(window_means), lty = 2, col = "red")
anim_data <- surprise_animate(temporal_result)
# surprise_animate() flattens the temporal result into one long data frame
# (time x region rows), ready for gganimate-style plotting.
head(anim_data)
#> time surprise signed_surprise region
#> 1 2018 0.4866159 0.4866159 North
#> 2 2018 0.4548098 0.4548098 South
#> 3 2018 0.4697058 -0.4697058 East
#> 4 2018 0.4470164 -0.4470164 West
#> 5 2019 0.5360584 0.5360584 North
#> 6 2019 0.4604919 0.4604919 South
funnel_df <- compute_funnel_data(
observed = nc$SID74,
sample_size = nc$BIR74,
type = "count",
limits = c(2, 3) # 2 and 3 SD limits
)
head(funnel_df)
#> observed sample_size expected se z_score lower_2sd upper_2sd
#> 1 1 1091 2.2053964 1.4850577 -0.8116832 -0.7647190 5.175512
#> 2 0 487 0.9844437 0.9921913 -0.9921913 -0.9999390 2.968826
#> 3 5 3188 6.4443663 2.5385756 -0.5689672 1.3672150 11.521518
#> 4 1 508 1.0268940 1.0133578 -0.0265395 -0.9998216 3.053610
#> 5 9 1421 2.8724732 1.6948372 3.6154073 -0.5172012 6.262148
#> 6 7 1452 2.9351380 1.7132244 2.3726384 -0.4913109 6.361587
#> lower_3sd upper_3sd
#> 1 -2.249777 6.660569
#> 2 -1.992130 3.961018
#> 3 -1.171361 14.060093
#> 4 -2.013179 4.066967
#> 5 -2.212038 7.956985
#> 6 -2.204535 8.074811
# Funnel plot - shows observed vs expected with control limits
ggplot(funnel_df, aes(x = sample_size, y = observed)) +
geom_ribbon(aes(ymin = lower_3sd, ymax = upper_3sd), fill = "gray90") +
geom_ribbon(aes(ymin = lower_2sd, ymax = upper_2sd), fill = "gray70") +
geom_line(aes(y = expected), linetype = "dashed") +
geom_point(aes(color = abs(z_score) > 2), size = 2) +
scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) +
labs(
title = "Funnel Plot: NC SIDS Cases",
x = "Births (Sample Size)",
y = "Observed SIDS Cases",
color = "Outside 2 SD"
) +
theme_minimal()
# Compute z-scores (need observed, expected, and sample_size)
# For counts, expected is derived from the overall rate applied to each
# county's births (indirect standardization).
overall_rate <- sum(nc$SID74) / sum(nc$BIR74)
expected_cases <- nc$BIR74 * overall_rate
z_scores <- funnel_zscore(nc$SID74, expected_cases, nc$BIR74, type = "count")
cat("First 5 z-scores:", round(z_scores[1:5], 3), "\n")
#> First 5 z-scores: -0.812 -0.992 -0.569 -0.027 3.615
# Compute (two-sided, per the output below) p-values from z-scores
p_vals <- funnel_pvalue(z_scores)
cat("First 5 p-values:", round(p_vals[1:5], 4), "\n")
#> First 5 p-values: 0.417 0.3211 0.5694 0.9788 3e-04
# Count counties beyond the 0.05 threshold; NA-safe via na.rm
cat("Significant at 0.05:", sum(p_vals < 0.05, na.rm = TRUE), "counties\n")
#> Significant at 0.05: 15 counties
result <- surprise(nc, observed = SID74, expected = BIR74)
ggplot(result) +
geom_sf(aes(fill = surprise)) +
scale_fill_surprise(option = "viridis") +
labs(title = "Sequential Scale (Viridis)") +
theme_minimal()
ggplot(result) +
geom_sf(aes(fill = signed_surprise)) +
scale_fill_surprise_diverging() +
labs(title = "Diverging Scale for Signed Surprise") +
theme_minimal()
ggplot(nc) +
stat_surprise(aes(geometry = geometry, observed = SID74, expected = BIR74)) +
scale_fill_surprise_diverging() +
labs(title = "stat_surprise() Computes Inline") +
theme_minimal()
result <- surprise(nc, observed = SID74, expected = BIR74)
ggplot(result, aes(x = surprise)) +
geom_surprise_histogram(bins = 15) +
labs(title = "Distribution of Surprise Values") +
theme_minimal()
ggplot(result, aes(x = signed_surprise)) +
geom_surprise_density(fill = "#4575b4", alpha = 0.7) +
geom_vline(xintercept = 0, linetype = "dashed") +
labs(title = "Density of Signed Surprise") +
theme_minimal()
# st_density computes kernel density estimation for point patterns
# Returns a list with density grid components: x, y (grid coordinates)
# and z (the density matrix evaluated on that grid).
nc_centroids <- st_centroid(nc)
# Bandwidth is in the data's coordinate units (degrees here, since the
# CRS is geodetic NAD27).
density_result <- st_density(nc_centroids, bandwidth = 0.5)
cat("Density result structure:\n")
#> Density result structure:
cat("- x grid:", length(density_result$x), "values\n")
#> - x grid: 100 values
cat("- y grid:", length(density_result$y), "values\n")
#> - y grid: 100 values
cat("- z matrix:", dim(density_result$z)[1], "x", dim(density_result$z)[2], "\n")
#> - z matrix: 100 x 100
# Create larger regions to aggregate to
# Using a simple union of counties
nc_result <- surprise(nc, observed = SID74, expected = BIR74)
# Show surprise values from result -- column subsetting on an sf object
# keeps the geometry column, so the printed output is still an sf object.
cat("Sample surprise values:\n")
#> Sample surprise values:
print(head(nc_result[, c("NAME", "surprise", "signed_surprise")], 5))
#> Simple feature collection with 5 features and 3 fields
#> Geometry type: MULTIPOLYGON
#> Dimension: XY
#> Bounding box: xmin: -81.74107 ymin: 36.07282 xmax: -75.77316 ymax: 36.58965
#> Geodetic CRS: NAD27
#> NAME surprise signed_surprise geometry
#> 1 Ashe 0.1952858 -0.1952858 MULTIPOLYGON (((-81.47276 3...
#> 2 Alleghany 0.1107616 -0.1107616 MULTIPOLYGON (((-81.23989 3...
#> 3 Surry 0.2909684 -0.2909684 MULTIPOLYGON (((-80.45634 3...
#> 4 Currituck 0.1171487 -0.1171487 MULTIPOLYGON (((-76.00897 3...
#> 5 Northampton 0.5777373 0.5777373 MULTIPOLYGON (((-77.21767 3...
result <- surprise(nc, observed = SID74, expected = BIR74)
plot(result, which = "signed_surprise", main = "Base R Plot: Signed Surprise")
auto_result <- auto_surprise(c(50, 100, 150, 200, 75), c(10000, 50000, 100000, 25000, 15000))
plot(auto_result, type = "histogram", main = "Surprise Distribution")
result <- surprise(nc, observed = SID74, expected = BIR74)
summary(result)
#> AREA PERIMETER CNTY_ CNTY_ID
#> Min. :0.0420 Min. :0.999 Min. :1825 Min. :1825
#> 1st Qu.:0.0910 1st Qu.:1.324 1st Qu.:1902 1st Qu.:1902
#> Median :0.1205 Median :1.609 Median :1982 Median :1982
#> Mean :0.1263 Mean :1.673 Mean :1986 Mean :1986
#> 3rd Qu.:0.1542 3rd Qu.:1.859 3rd Qu.:2067 3rd Qu.:2067
#> Max. :0.2410 Max. :3.640 Max. :2241 Max. :2241
#> NAME FIPS FIPSNO CRESS_ID
#> Length:100 Length:100 Min. :37001 Min. : 1.00
#> Class :character Class :character 1st Qu.:37050 1st Qu.: 25.75
#> Mode :character Mode :character Median :37100 Median : 50.50
#> Mean :37100 Mean : 50.50
#> 3rd Qu.:37150 3rd Qu.: 75.25
#> Max. :37199 Max. :100.00
#> BIR74 SID74 NWBIR74 BIR79
#> Min. : 248 Min. : 0.00 Min. : 1.0 Min. : 319
#> 1st Qu.: 1077 1st Qu.: 2.00 1st Qu.: 190.0 1st Qu.: 1336
#> Median : 2180 Median : 4.00 Median : 697.5 Median : 2636
#> Mean : 3300 Mean : 6.67 Mean :1050.8 Mean : 4224
#> 3rd Qu.: 3936 3rd Qu.: 8.25 3rd Qu.:1168.5 3rd Qu.: 4889
#> Max. :21588 Max. :44.00 Max. :8027.0 Max. :30757
#> SID79 NWBIR79 geometry surprise
#> Min. : 0.00 Min. : 3.0 MULTIPOLYGON :100 Min. :0.03343
#> 1st Qu.: 2.00 1st Qu.: 250.5 epsg:4267 : 0 1st Qu.:0.22153
#> Median : 5.00 Median : 874.5 +proj=long...: 0 Median :0.31363
#> Mean : 8.36 Mean : 1352.8 Mean :0.32285
#> 3rd Qu.:10.25 3rd Qu.: 1406.8 3rd Qu.:0.43273
#> Max. :57.00 Max. :11631.0 Max. :0.60083
#> signed_surprise
#> Min. :-0.5834219
#> 1st Qu.:-0.2925958
#> Median :-0.1175466
#> Mean : 0.0001488
#> 3rd Qu.: 0.3426517
#> Max. : 0.6008295
data(canada_mischief, package = "bayesiansurpriser")
# Inspect the bundled Canadian provinces/territories dataset
cat("Canada mischief dataset:\n")
#> Canada mischief dataset:
cat("Rows:", nrow(canada_mischief), "\n")
#> Rows: 13
cat("Columns:", paste(names(canada_mischief), collapse = ", "), "\n")
#> Columns: name, population, mischief_count, rate_per_100k, pop_proportion, mischief_proportion
# Compute surprise on Canadian data: counts vs. population exposure
canada_result <- auto_surprise(
canada_mischief$mischief_count,
canada_mischief$population
)
cat("\nSurprise values by province:\n")
#>
#> Surprise values by province:
# Pair each province with its rounded surprise for display
df_result <- data.frame(
province = canada_mischief$name,
surprise = round(canada_result$surprise, 4)
)
# Print provinces sorted from most to least surprising
print(df_result[order(-df_result$surprise), ])
#> province surprise
#> 1 Ontario 0.6176
#> 2 Quebec 0.6019
#> 4 Alberta 0.5883
#> 3 British Columbia 0.5877
#> 13 Nunavut 0.5860
#> 7 Nova Scotia 0.5856
#> 8 New Brunswick 0.5855
#> 5 Manitoba 0.5853
#> 6 Saskatchewan 0.5851
#> 11 Northwest Territories 0.5851
#> 12 Yukon 0.5850
#> 10 Prince Edward Island 0.5543
#> 9 Newfoundland and Labrador 0.5516
data(example_counties, package = "bayesiansurpriser")
# Inspect the bundled synthetic counties dataset (plain data.frame,
# 50 rows, with planted hotspot/coldspot flags per the columns below).
cat("Example counties dataset:\n")
#> Example counties dataset:
cat("Class:", class(example_counties)[1], "\n")
#> Class: data.frame
cat("Rows:", nrow(example_counties), "\n")
#> Rows: 50
cat("Columns:", paste(names(example_counties), collapse = ", "), "\n")
#> Columns: county_id, name, population, events, expected, is_hotspot, is_coldspot
sessionInfo()
#> R version 4.5.0 (2025-04-11)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.4.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
#>
#> locale:
#> [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
#>
#> time zone: America/Los_Angeles
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] dplyr_1.1.4 ggplot2_4.0.1 sf_1.0-23
#> [4] cancensus_0.5.7 bayesiansurpriser_0.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.0 Rcpp_1.1.0
#> [5] tidyselect_1.2.1 jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10
#> [9] fastmap_1.2.0 R6_2.6.1 labeling_0.4.3 generics_0.1.4
#> [13] classInt_0.4-11 s2_1.1.9 knitr_1.50 MASS_7.3-65
#> [17] tibble_3.3.1 units_1.0-0 DBI_1.2.3 bslib_0.9.0
#> [21] pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.1.7 cachem_1.1.0
#> [25] xfun_0.52 sass_0.4.10 S7_0.2.1 viridisLite_0.4.2
#> [29] cli_3.6.5 withr_3.0.2 magrittr_2.0.4 wk_0.9.5
#> [33] class_7.3-23 digest_0.6.39 grid_4.5.0 lifecycle_1.0.5
#> [37] vctrs_0.7.0 KernSmooth_2.23-26 proxy_0.4-28 evaluate_1.0.4
#> [41] glue_1.8.0 farver_2.1.2 e1071_1.7-17 rmarkdown_2.29
#> [45] tools_4.5.0 pkgconfig_2.0.3 htmltools_0.5.8.1