## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>",
  fig.width  = 7,
  fig.height = 4.5
)
set.seed(1234)

## ----load---------------------------------------------------------------------
library(blockwise)
data(bike)
str(bike, list.len = 20)

## ----mask---------------------------------------------------------------------
bike_miss <- simulate_blockwise_missing(
  bike,
  blocks = list(
    c("windspeed", "hum", "weekday"),
    c("hr", "temp", "weathersit")
  ),
  prop_missing = 0.30,
  noise        = 0.05
)

round(colMeans(is.na(bike_miss)) * 100, 1)  # percent missing per column

## ----split--------------------------------------------------------------------
set.seed(1234)
idx <- sample(nrow(bike_miss), size = floor(0.75 * nrow(bike_miss)))
train <- bike_miss[idx, ]
test  <- bike_miss[-idx, ]

X_train <- train[, setdiff(names(train), "cnt")]
y_train <- train$cnt

X_test  <- test[, setdiff(names(test), "cnt")]
y_test  <- test$cnt

## ----fit-lm-------------------------------------------------------------------
set.seed(1234)
fit_lm <- brm(X_train, y_train, learner = learner_lm())
fit_lm

## ----fit-gbm, eval = requireNamespace("gbm", quietly = TRUE)------------------
# fit_gbm <- brm(
#   X_train, y_train,
#   learner  = learner_gbm(distribution = "poisson", n.trees = 300),
#   n_blocks = fit_lm$n_blocks     # reuse so models are comparable
# )
# fit_gbm

## ----rmse---------------------------------------------------------------------
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))

pred_lm <- predict(fit_lm, X_test)
cat("BRM (lm)   RMSE:", round(rmse(y_test, pred_lm), 2), "\n")

## ----rmse-gbm, eval = requireNamespace("gbm", quietly = TRUE)-----------------
# pred_gbm <- predict(fit_gbm, X_test)
# cat("BRM (gbm)  RMSE:", round(rmse(y_test, pred_gbm), 2), "\n")

## ----listwise-----------------------------------------------------------------
complete_train <- na.omit(train)
fit_lw <- lm(cnt ~ ., data = complete_train)

# For a fair comparison we need to impute the test set's NAs somehow; use
# mean/mode from the complete training rows.
X_test_imp <- X_test
for (j in names(X_test_imp)) {
  na_idx <- is.na(X_test_imp[[j]])
  if (!any(na_idx)) next
  ref <- complete_train[[j]]
  if (is.factor(ref)) {
    X_test_imp[[j]][na_idx] <- names(sort(table(ref), decreasing = TRUE))[1]
  } else {
    X_test_imp[[j]][na_idx] <- mean(ref, na.rm = TRUE)
  }
}
pred_lw <- predict(fit_lw, newdata = X_test_imp)

cat("Listwise-deletion lm  RMSE:", round(rmse(y_test, pred_lw), 2), "\n")
cat("Training rows used    : BRM =", nrow(train), " listwise =",
    nrow(complete_train), "\n")

