# function to simulate data
simulate_data <- function(truth, covariates, n, random.seed, sigma_y = 1) {
# set seed
random.seed <- list(seed = random.seed, kind = "Mersenne-Twister", normal.kind = "Inversion", sample.kind = "Rounding")
suppressWarnings(do.call("set.seed", random.seed))
# sample data n = 100, 500, 2500
selected_sample <- sample(seq_along(truth$mu.0), n, replace = FALSE)
mu.0 = truth$mu.0[selected_sample]
mu.1 = truth$mu.1[selected_sample]
x = covariates[selected_sample,]
# assign treatment
n1=round(n/2)
n0=n-n1
zind=sample(1:n,size=n1)
z=numeric(n)
z[zind]=1
mu = z * mu.1 + (1-z) * mu.0
y <- mu + sigma_y*rnorm(n)
return(
list(
x = x,
z = z,
y = y,
mu.1 = mu.1,
mu.0 = mu.0
)
)
}
# function to get true GATE
trueGATE <- function(truth, tau, ngates = 5) {
n = length(tau)
fd_label = ntile(tau, ngates)
gates = numeric(ngates)
for (i in 1:ngates) {
gates[i] = mean(truth$mu.1[fd_label == i] - truth$mu.0[fd_label == i])
}
return(gates)
}
# function to get true GATE via cross-fitting
compute_true_GATE <- function(truth, input_x, full_data, n_sample, nfolds, ngates, iter) {
n_train = round(n_sample*(nfolds-1)/nfolds)
train_data = simulate_data(truth, input_x, n_train, iter)
cf_train = causal_forest(
model.matrix(~.-1, data = train_data$x),
train_data$y,
train_data$z,
num.trees = 10,
num.threads = 1
)
tau_test = predict(cf_train, newdata = model.matrix(~.-1, data = full_data$x))$predictions
gate_test = trueGATE(full_data, tau_test, ngates)
return(tibble(
iter = iter,
group = 1:ngates,
true_gate = gate_test # FIX: Rename here to match your join logic
))
}
make_synth_acic_population <- function(n_pop = 5000, seed = 1,
p_cont = 10, p_bin = 10,
sigma_mu = 1) {
set.seed(seed)
# correlated continuous covariates
rho <- 0.4
Sigma <- rho ^ abs(outer(seq_len(p_cont), seq_len(p_cont), "-"))
Xc <- matrix(rnorm(n_pop * p_cont), n_pop, p_cont) %*% chol(Sigma)
colnames(Xc) <- paste0("x", seq_len(p_cont))
# binary covariates correlated with continuous ones
Xb <- matrix(0, n_pop, p_bin)
colnames(Xb) <- paste0("b", seq_len(p_bin))
for (j in seq_len(p_bin)) {
lin <- 0.6 * Xc[, ((j - 1) %% p_cont) + 1] - 0.3 * Xc[, ((j) %% p_cont) + 1] + rnorm(n_pop, sd = 0.5)
pr <- plogis(lin)
Xb[, j] <- rbinom(n_pop, 1, pr)
}
# a couple categorical covariates (factors)
cut3 <- cut(Xc[, 3], breaks = quantile(Xc[, 3], probs = c(0, 1/3, 2/3, 1)),
include.lowest = TRUE, labels = c("A", "B", "C"))
cut4 <- cut(Xc[, 4], breaks = quantile(Xc[, 4], probs = c(0, 0.25, 0.5, 0.75, 1)),
include.lowest = TRUE, labels = c("L1", "L2", "L3", "L4"))
x <- data.frame(
Xc,
Xb,
cat1 = factor(cut3),
cat2 = factor(cut4)
)
# define mu0(x) and heterogeneous tau(x) with nonlinearities and interactions
mu0 <- 0.5 * Xc[, 1] -
0.25 * (Xc[, 2]^2) +
sin(Xc[, 3]) +
0.6 * Xb[, 1] +
0.4 * (cut3 == "B") -
0.3 * (cut3 == "C") +
0.5 * Xc[, 5] * Xb[, 2] +
0.3 * pmax(Xc[, 6], 0)
tau <- 0.2 +
0.35 * Xc[, 1] +
0.25 * (Xc[, 7] > 0) +
0.2 * Xb[, 3] -
0.25 * Xc[, 2] * Xb[, 1] +
0.3 * (cut4 %in% c("L3", "L4")) +
0.15 * cos(Xc[, 8])
mu0 <- sigma_mu * mu0
tau <- sigma_mu * tau
mu1 <- mu0 + tau
truth <- list(
mu.0 = as.numeric(mu0),
mu.1 = as.numeric(mu1)
)
list(x = x, truth = truth)
}
# function to compute GATE bounds with michael's original code
compute_GATE_bounds <- function(truth, input_x, n_sample, nfolds, ngates, n_iter) {
# get training data
full_data = simulate_data(truth, input_x, n_sample, n_iter)
# create folds
folds = caret::createFolds(full_data$y, k = nfolds)
indcv = numeric(n_sample)
tauCV = matrix(0, nrow = n_sample, ncol = nfolds)
for (i in 1:nfolds) {
x_full = model.matrix(~.-1, data = full_data$x)
x_train = x_full[-folds[[i]], ]
y_train = full_data$y[-folds[[i]]]
z_train = full_data$z[-folds[[i]]]
x_test = x_full[folds[[i]], ]
y_test = full_data$y[folds[[i]]]
z_test = full_data$z[folds[[i]]]
# fit causal forest
cf_train = causal_forest(
x_train,
y_train,
z_train,
num.trees = 20,
num.threads = 1
)
# compute tau
tauCV[, i] = predict(cf_train, x_full)$predictions
indcv[folds[[i]]] = rep(i, nrow(x_test))
}
# compute GATE
output = GATEcv(full_data$z, tauCV, full_data$y, indcv, ngates = ngates)
return(
list(
group = 1:ngates,
gate = output$gate,
sd = output$sd,
upper = output$gate + 1.96 * output$sd,
lower = output$gate - 1.96 * output$sd
)
)
}
# function to compute GATE bounds with evalHTE package
compute_GATE_bounds_pkg <- function(truth, input_x, n_sample, nfolds, ngates, n_iter, algs) {
# get training data
full_data = simulate_data(truth, input_x, n_sample, n_iter)
# create data
df = data.frame(
full_data$x,
y = full_data$y,
z = full_data$z
)
# specify formula
cov_names = input_x %>% names()
user_formula <- paste0("y ~ z*(", paste(cov_names, collapse = " + "), ")")
# run with evalITR package
fit <- estimate_hte(
treatment = "z",
form = user_formula,
data = df,
algorithms = algs,
n_folds = nfolds,
meta_learner = "slearner")
# evaluate HTE
est <- evaluate_hte(fit)
gate_pkg <- summary(est)$GATE %>%
mutate(iter = n_iter)
return(gate_pkg)
}
# function to loop over n_sim for true GATE
run_simluation_gate <- function(truth, input_x, full_data, n_sample, nfolds, ngates, n_sim) {
# loop over n_sim
results = future_map(1:n_sim, ~compute_true_GATE(truth, input_x, full_data, n_sample, nfolds, ngates, .x),
.options = furrr_options(seed = 123)
) %>% bind_rows()
return(results)
}
# function to loop over n_sim for gate sd
run_simluation_gate_sd <- function(truth, input_x, n_sample, nfolds, ngates, n_sim, pkg = FALSE, algs = c("causal_forest")) {
# if pkg == FALSE, compute GATE bounds using michael's original code
if (pkg == FALSE) {
results = future_map(1:n_sim, ~compute_GATE_bounds(truth, input_x, n_sample, nfolds, ngates, .x),
.options = furrr_options(seed = 123)
) %>% bind_rows()
} else {
# if pkg == TRUE, compute GATE bounds using evalITR package
results = future_map(1:n_sim, ~compute_GATE_bounds_pkg(truth, input_x, n_sample, nfolds, ngates, .x, algs = algs),
.options = furrr_options(seed = 123)
) %>% bind_rows()
}
return(results)
}
Sys.setenv(
OMP_NUM_THREADS = 1,
OPENBLAS_NUM_THREADS = 1,
MKL_NUM_THREADS = 1,
VECLIB_MAXIMUM_THREADS = 1
)
library(evalHTE)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(future)
future::plan(future::sequential)
library(furrr)
library(magrittr)
# load ACIC 2016 data
library(grf) # causal_forest
options(grf.num.threads = 1)
library(dplyr) # ntile, left_join, mutate, summarise
library(tibble)
synth <- make_synth_acic_population(n_pop = 5000, seed = 28, p_cont = 10, p_bin = 10)
input_x <- synth$x
truth <- synth$truth
full_data <- simulate_data(truth, input_x, nrow(input_x), 2021)
# parameters
n_sample = 50
n_sim = 5
n_folds = 2
n_gates = 2
# compute true GATE
results_gate = run_simluation_gate(truth, input_x, full_data, n_sample, n_folds, n_gates, n_sim)
# compute GATE bounds
results_sd_pkg = run_simluation_gate_sd(truth, input_x, n_sample, n_folds, n_gates, n_sim, pkg = TRUE, algs = c("causal_forest"))
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
# check coverage
results_sd_pkg %>%
left_join(results_gate, by = c("iter", "group")) %>%
mutate(
upper = estimate + 1.96 * std.deviation,
lower = estimate - 1.96 * std.deviation,
coverage = ((upper > true_gate) & (lower < true_gate)) * 1
) %>%
summarise(
coverage = mean(coverage, na.rm = TRUE)
)
#> # A tibble: 1 × 1
#> coverage
#> <dbl>
#> 1 0.9