Spatial Data Workflows with sf

library(bayesiansurpriser)
library(sf)
library(ggplot2)

Overview

The bayesiansurpriser package is designed for seamless integration with the sf package for spatial data. This vignette demonstrates workflows for analyzing spatial data.

Basic sf Workflow

Loading Spatial Data

# Load the NC SIDS dataset
nc <- st_read(system.file("shape/nc.shp", package = "sf"), quiet = TRUE)

# Examine the data
head(nc[, c("NAME", "SID74", "BIR74", "SID79", "BIR79")])
#> Simple feature collection with 6 features and 5 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: -81.74107 ymin: 36.07282 xmax: -75.77316 ymax: 36.58965
#> Geodetic CRS:  NAD27
#>          NAME SID74 BIR74 SID79 BIR79                       geometry
#> 1        Ashe     1  1091     0  1364 MULTIPOLYGON (((-81.47276 3...
#> 2   Alleghany     0   487     3   542 MULTIPOLYGON (((-81.23989 3...
#> 3       Surry     5  3188     6  3616 MULTIPOLYGON (((-80.45634 3...
#> 4   Currituck     1   508     2   830 MULTIPOLYGON (((-76.00897 3...
#> 5 Northampton     9  1421     3  1606 MULTIPOLYGON (((-77.21767 3...
#> 6    Hertford     7  1452     5  1838 MULTIPOLYGON (((-76.74506 3...

Computing Surprise

The surprise() function automatically detects sf objects and uses the appropriate method:

result <- surprise(nc, observed = SID74, expected = BIR74)

# Result is an sf object with surprise columns added
class(result)
#> [1] "bs_surprise_sf" "sf"             "data.frame"
names(result)[grepl("surprise", names(result))]
#> [1] "surprise"        "signed_surprise"

Convenience Function: st_surprise

For explicit sf workflows, use st_surprise():

result2 <- st_surprise(nc, observed = SID74, expected = BIR74)

# Identical results
all.equal(result$surprise, result2$surprise)
#> [1] TRUE

Accessing Results

Extracting Surprise Values

# Get surprise values
surprise_vals <- get_surprise(result, "surprise")
head(surprise_vals)
#> [1] 0.1952858 0.1107616 0.2909684 0.1171487 0.5777373 0.5143742

# Get signed surprise values
signed_vals <- get_surprise(result, "signed")
head(signed_vals)
#> [1] -0.1952858 -0.1107616 -0.2909684 -0.1171487  0.5777373  0.5143742

Accessing the Model Space

mspace <- get_model_space(result)
print(mspace)
#> <bs_model_space>
#>   Models: 3 
#>    1. Uniform (prior: 0.3333)
#>    2. Base Rate (prior: 0.3333)
#>    3. de Moivre Funnel (paper) (prior: 0.3333)

Working with the sf Object

The result is a full sf object, so all sf operations work:

# Filter high-surprise regions
high_surprise <- result[result$surprise > median(result$surprise), ]
cat("Regions with above-median surprise:", nrow(high_surprise), "\n")
#> Regions with above-median surprise: 50

# Compute centroids
centroids <- st_centroid(result)
#> Warning: st_centroid assumes attributes are constant over geometries

Visualization

ggplot2 Integration

ggplot(result) +
  geom_sf(aes(fill = surprise)) +
  scale_fill_surprise() +
  labs(title = "Bayesian Surprise: NC SIDS 1974")

Spatial map of Bayesian surprise values for an sf object.

ggplot(result) +
  geom_sf(aes(fill = signed_surprise)) +
  scale_fill_surprise_diverging() +
  labs(title = "Signed Surprise",
       subtitle = "Red = over-representation, Blue = under-representation")

Spatial map of Bayesian surprise values for an sf object.

Comparing Time Periods

# Compute surprise for two time periods
result_74 <- surprise(nc, observed = SID74, expected = BIR74)
result_79 <- surprise(nc, observed = SID79, expected = BIR79)

# Compare
nc$surprise_74 <- result_74$surprise
nc$surprise_79 <- result_79$surprise
nc$surprise_change <- nc$surprise_79 - nc$surprise_74

# Plot change
ggplot(nc) +
  geom_sf(aes(fill = surprise_change)) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       name = "Change in\nSurprise") +
  labs(title = "Change in Surprise: 1974 to 1979")

Spatial map of Bayesian surprise values for an sf object.

Advanced: Custom Model Spaces

# Create sophisticated model space
custom_space <- model_space(
  bs_model_uniform(),
  bs_model_baserate(nc$BIR74),
  bs_model_gaussian(),
  bs_model_funnel(nc$BIR74),
  prior = c(0.1, 0.3, 0.2, 0.4)
)

result_custom <- surprise(nc, observed = SID74, expected = BIR74,
                          models = custom_space)

# Compute a global posterior model update explicitly if needed
updated_space <- bayesian_update(custom_space, nc$SID74)
cat("Prior:", custom_space$prior, "\n")
#> Prior: 0.1 0.3 0.2 0.4
cat("Posterior:", updated_space$posterior, "\n")
#> Posterior: 0.1986683 0.7875552 2.028479e-151 0.01377646

Integration with dplyr

library(dplyr)

# Pipe-friendly workflow
nc %>%
  st_surprise(observed = SID74, expected = BIR74) %>%
  filter(surprise > 0.5) %>%
  select(NAME, surprise, signed_surprise, geometry)

Tips for Large Datasets

  1. Pre-compute expected values: If expected values are expensive to compute, store them
  2. Use appropriate models: For very large datasets, the Sampled model with smaller sample_frac can be faster
  3. Parallel processing: The package is compatible with future/furrr for parallel computation
# For large spatial datasets
result <- surprise(large_sf,
                  observed = cases,
                  expected = population,
                  models = model_space(
                    bs_model_uniform(),
                    bs_model_baserate(large_sf$population)
                  ))