Commit 63006fe

add ml for causal bonus section
1 parent 7d3fd80 commit 63006fe

File tree: 94 files changed, +7310 -33 lines changed


exercises/15-bonus-ml-for-causal-exercises.qmd

Lines changed: 15 additions & 9 deletions
@@ -179,7 +179,7 @@ ate_gcomp
 
 ## Your Turn 1
 
-1. First, create a character vector `sl_library` that specifies the following algorithms: "SL.glm", "SL.ranger", "SL.xgboost", "SL.gam". Then, Fit a SuperLearner for the exposure model using the `SuperLearner` package. The predictors for this model should be the confounders identified in the DAG: `park_ticket_season`, `park_close`, and `park_temperature_high`. The outcome is `park_extra_magic_morning`.
+1. First, create a character vector `sl_library` that specifies the following algorithms: "SL.glm", "SL.ranger", "SL.gam". Then, fit a SuperLearner for the exposure model using the `SuperLearner` package. The predictors for this model should be the confounders identified in the DAG: `park_ticket_season`, `park_close`, and `park_temperature_high`. The outcome is `park_extra_magic_morning`.
 2. Fit a SuperLearner for the outcome model using the `SuperLearner` package. The predictors for this model should be the confounders plus the exposure: `park_extra_magic_morning`, `park_ticket_season`, `park_close`, and `park_temperature_high`. The outcome is `wait_minutes_posted_avg`.
 3. Inspect the fitted SuperLearner objects.
 
@@ -251,7 +251,6 @@ outcome_rmse
 sl_library_extended <- c(
   "SL.glm",
   "SL.ranger",
-  "SL.xgboost",
   "SL.earth",
   "SL.gam",
   "SL.glm.interaction",
@@ -310,16 +309,23 @@ tidy(ipw_model) |>
 ```{r}
 # G-computation with SuperLearner outcome model
 # Step 1: Create counterfactual datasets
-seven_dwarfs_clone <- seven_dwarfs |>
-  mutate(park_close = as.numeric(park_close))
+# For SuperLearner prediction, we need only the columns used in the model
 
 # Dataset where everyone is treated, `park_extra_magic_morning` = 1
-data_all_treated <- seven_dwarfs_clone |>
-  mutate(park_extra_magic_morning = ___)
+data_all_treated <- seven_dwarfs |>
+  select(park_extra_magic_morning, park_ticket_season, park_close, park_temperature_high) |>
+  mutate(
+    park_close = as.numeric(park_close),
+    park_extra_magic_morning = ___
+  )
 
 # Dataset where everyone is control, `park_extra_magic_morning` = 0
-data_all_control <- seven_dwarfs_clone |>
-  mutate(park_extra_magic_morning = ___)
+data_all_control <- seven_dwarfs |>
+  select(park_extra_magic_morning, park_ticket_season, park_close, park_temperature_high) |>
+  mutate(
+    park_close = as.numeric(park_close),
+    park_extra_magic_morning = ___
+  )
 
 # Step 2: Predict outcomes under each scenario using SuperLearner
 pred_treated <- predict(______, newdata = ______)$pred[, 1]
@@ -350,7 +356,7 @@ outcome_sl_bounded <- SuperLearner(
   X = seven_dwarfs |>
     select(__________, park_ticket_season, park_close, park_temperature_high) |>
     mutate(park_close = as.numeric(park_close)),
-  family = binomial(),
+  family = quasibinomial(),
   SL.library = __________,
   cvControl = list(V = 5)
 )
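
The `quasibinomial()` switch in this hunk goes with fitting the outcome model on a continuous outcome that has been rescaled to [0, 1] (hence `outcome_sl_bounded`), presumably to avoid `binomial()`'s non-integer-successes warnings. A minimal, self-contained sketch of the usual bounding transform (`y` here is a stand-in, not an object from the exercises):

```r
set.seed(1)
y <- rnorm(100, mean = 30, sd = 10) # stand-in continuous outcome

# Rescale to [0, 1] so logit-scale machinery (and quasi-likelihood
# binomial fits) can be applied to a continuous outcome
min_y <- min(y)
max_y <- max(y)
y_bounded <- (y - min_y) / (max_y - min_y)

# Estimates on the bounded scale map back to the original scale
# by multiplying differences by (max_y - min_y)
range(y_bounded)
```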

slides/raw/15-bonus-ml-for-causal.html

Lines changed: 1168 additions & 0 deletions
Large diffs are not rendered by default.

slides/raw/15-bonus-ml-for-causal.qmd

Lines changed: 41 additions & 24 deletions
@@ -88,7 +88,6 @@ dag_base <- dag |>
     aes(x, y, xend = xend, yend = yend, color = status)
   ) +
   geom_dag_point() +
-  geom_dag_label_repel2(aes(label = label), seed = 1234) +
   scale_color_okabe_ito(na.value = "grey90") +
   theme_dag() +
   theme(legend.position = "none")
@@ -171,11 +170,15 @@ halfmoon::plot_mirror_distributions(
   ggokabeito::scale_fill_okabe_ito()
 ```
 
-## G-computation
+## G-computation {background-color="#23373B"}
 
 1. Fit a model for `y ~ x + z` where z is all confounders
 2. Create a duplicate of your data set for each level of `x`
 3. Set the value of x to a single value for each cloned data set (e.g. `x = 1` for one, `x = 0` for the other)
+
+
+## G-computation {background-color="#23373B"}
+
 4. Make predictions using the model on the cloned data sets
 5. Calculate the estimate you want, e.g. `mean(x_1) - mean(x_0)`
 
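The five-step recipe in this hunk maps to only a few lines of R. A minimal, self-contained sketch on simulated data (all names here are hypothetical; the slides themselves use the NHEFS data):

```r
library(dplyr)

set.seed(2025)
n <- 1000
dat <- tibble(
  z = rnorm(n),                  # confounder
  x = rbinom(n, 1, plogis(z)),   # exposure affected by z
  y = 1 + 2 * x + z + rnorm(n)   # outcome; true ATE is 2
)

# 1. Fit an outcome model with the exposure and confounders
fit <- lm(y ~ x + z, data = dat)

# 2-3. Clone the data, setting x to 1 in one copy and 0 in the other
dat_x1 <- mutate(dat, x = 1)
dat_x0 <- mutate(dat, x = 0)

# 4-5. Predict on each clone and contrast the means
mean(predict(fit, newdata = dat_x1)) - mean(predict(fit, newdata = dat_x0))
```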
@@ -258,6 +261,7 @@ dag_base +
 
 ```{r}
 #| echo: false
+#| fig-width: 10
 smk_wt_dag <- dagify(
   qsmk ~ sex + race + age + education +
     smokeintensity + smokeyrs + exercise + active + wt71,
@@ -288,7 +292,7 @@ smk_wt_dag |>
   geom_dag_edges() +
   geom_dag_point() +
   geom_dag_label_repel(aes(label = label), seed = 1234) +
-  scale_color_okabe_ito(na.value = "grey90") +
+  scale_color_okabe_ito(na.value = "grey60") +
   theme_dag() +
   theme(legend.position = "none")
 ```
@@ -397,14 +401,18 @@ dagify(
 
 ```{r}
 #| echo: false
-knitr::include_graphics("img/ml_algorithms.png")
+knitr::include_graphics("img/superlearner.png")
 ```
 
+::: tiny
+Image source: Sherri Rose
+:::
+
 ## Ensemble Algorithms with SuperLearner
 
 ```{r}
 #| echo: false
-knitr::include_graphics("img/superlearner.png")
+knitr::include_graphics("img/ml_algorithms.png")
 ```
 
 :::{.fragment}
@@ -414,12 +422,12 @@ Given a set of candidate algorithms (and hyperparameters), stacked ensembles com
 ## SuperLearner: Exposure Model
 
 ```{r}
-#| code-line-numbers: "|1,3,8|4-7|"
+#| code-line-numbers: "|1,3,10|4-7|"
 #| cache: true
-sl_library <- c("SL.glm", "SL.ranger", "SL.xgboost", "SL.gam")
+sl_library <- c("SL.glm", "SL.ranger", "SL.gam")
 
 propensity_sl <- SuperLearner(
-  Y = nhefs_complete_uc$qsmk |> as.integer(),
+  Y = as.integer(nhefs_complete_uc$qsmk == "Yes"),
   X = nhefs_complete_uc |>
     select(sex, race, age, education, smokeintensity,
       smokeyrs, exercise, active, wt71) |>
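
For trying the updated call outside the course data, a self-contained sketch of a `SuperLearner` fit on simulated data (the library vector mirrors the new `sl_library`; everything else is illustrative, and the `ranger` and `gam` packages must be installed for those wrappers):

```r
library(SuperLearner)

set.seed(42)
n <- 500
X <- data.frame(z1 = rnorm(n), z2 = rnorm(n))
y <- rbinom(n, 1, plogis(0.5 * X$z1 - X$z2))

sl_fit <- SuperLearner(
  Y = y,                # numeric 0/1, like as.integer(qsmk == "Yes")
  X = X,                # data frame of predictors
  family = binomial(),
  SL.library = c("SL.glm", "SL.ranger", "SL.gam"),
  cvControl = list(V = 5)
)

sl_fit                  # cross-validated risk and weight per learner
head(sl_fit$SL.predict) # ensemble predictions on the training data
```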
@@ -460,7 +468,7 @@ outcome_sl <- SuperLearner(
 outcome_sl
 ```
 
-## *Your Turn 1*
+## *Your Turn 1* {.small}
 
 ```{r}
 #| echo: false
@@ -497,7 +505,7 @@ tidy(ipw_model)
 ## G-computation with SuperLearner
 
 ```{r}
-#| code-line-numbers: "|1-2,7|4-5,8|10"
+#| code-line-numbers: "|1,5,7,11|13-14|16"
 data_all_quit <- nhefs_complete_uc |>
   select(qsmk, sex, race, age, education, smokeintensity,
     smokeyrs, exercise, active, wt71) |>
@@ -542,17 +550,19 @@ countdown::countdown(minutes = 8)
 
 - In **IPW** and **G-computation**, we estimate the average treatment effect (ATE) using predictions from the exposure and outcome models. But these algorithms optimize for the predictions, not the ATE.
 - In **TMLE**, we adjust the predictions to specifically target the ATE. We change the bias-variance tradeoff to focus on the ATE rather than just minimizing prediction error. This is a debiasing step that also improves the efficiency of the estimate!
-- Targeting is a general technique that can be applied to many problems, not just causal ones
 
 ## Targeted Learning: valid statistical inference
-- In **IPW** and **G-computation**, we can using ML algorithms to make predictions, but we cannot easily get valid confidence intervals. Bootstrapping is often used, but it can be computationally intensive and not always valid.
+- In **IPW** and **G-computation**, we cannot easily get valid confidence intervals with ML. Bootstrapping is often used, but it can be computationally intensive and not always valid.
 - In **TMLE**, we can use the influence curve to get valid confidence intervals. The influence curve is a way to estimate the variance of the TMLE estimate, even when using complex ML algorithms.
 
 ## The TMLE Algorithm {background-color="#23373B"}
 
 1. Start with SuperLearner predictions for the outcome
 2. Calculate the propensity scores using SuperLearner
 3. Create the clever covariate using the propensity scores
+
+## The TMLE Algorithm {background-color="#23373B"}
+
 4. Fit the fluctuation model to learn how much to adjust the outcome predictions
 5. Update the predictions with the targeted adjustment
 6. Calculate the TMLE estimate and standard error using the influence curve
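
Written out, steps 3-5 amount to fitting one extra coefficient. In standard TMLE notation, with initial outcome predictions $\hat{Q}(A, W)$ and propensity scores $\hat{g}(W)$ (this mirrors the code in the hunks below), the clever covariate is

$$
H(A, W) = \frac{A}{\hat{g}(W)} - \frac{1 - A}{1 - \hat{g}(W)}
$$

and the fluctuation model estimates a single coefficient $\hat{\varepsilon}$ on $H$, giving the targeted update

$$
\operatorname{logit} \hat{Q}^*(A, W) = \operatorname{logit} \hat{Q}(A, W) + \hat{\varepsilon}\, H(A, W).
$$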
@@ -562,6 +572,7 @@ countdown::countdown(minutes = 8)
 ```{r}
 #| echo: true
 #| cache: true
+#| cache.lazy: false
 # For TMLE with continuous outcomes, fit SuperLearner on bounded Y
 min_y <- min(nhefs_complete_uc$wt82_71)
 max_y <- max(nhefs_complete_uc$wt82_71)
@@ -574,7 +585,7 @@ outcome_sl_bounded <- SuperLearner(
     select(qsmk, sex, race, age, education, smokeintensity,
       smokeyrs, exercise, active, wt71) |>
     mutate(across(everything(), as.numeric)),
-  family = binomial(),
+  family = quasibinomial(),
   SL.library = sl_library,
   cvControl = list(V = 5)
 )
@@ -588,7 +599,7 @@ initial_pred_no_quit <- predict(outcome_sl_bounded, newdata = data_all_no_quit)$
 
 # Predictions for observed treatment
 initial_pred_observed <- ifelse(
-  nhefs_complete_uc$qsmk == 1,
+  nhefs_complete_uc$qsmk == "Yes",
   initial_pred_quit,
   initial_pred_no_quit
 )
@@ -597,9 +608,9 @@ initial_pred_observed <- ifelse(
 ## TMLE Step 2: Clever Covariate
 
 ```{r}
-#| echo: true
+#| code-line-numbers: "|3-4"
 clever_covariate <- ifelse(
-  nhefs_complete_uc$qsmk == 1,
+  nhefs_complete_uc$qsmk == "Yes",
   1 / propensity_scores,
   -1 / (1 - propensity_scores)
 )
@@ -614,7 +625,8 @@ clever_covariate <- ifelse(
 ## TMLE Step 3: Targeting
 
 ```{r}
-#| echo: true
+#| code-line-numbers: "|4|5|6|7|8|11-12"
+#| output-location: fragment
 # Fluctuation model - learns how much to adjust
 # Use binomial family and work on logit scale
 fluctuation_model <- glm(
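
The hunk cuts off before the arguments of `glm()`. For orientation, the standard TMLE fluctuation regression has the following shape; this is a sketch under that assumption, not a quote of the off-screen slide code, with toy inputs so it runs standalone:

```r
# Toy inputs (shapes only, not real data; the slides use the NHEFS objects)
set.seed(7)
n <- 200
y_bounded <- runif(n)                       # outcome rescaled to [0, 1]
propensity_scores <- runif(n, 0.2, 0.8)
a <- rbinom(n, 1, propensity_scores)        # observed exposure
initial_pred_observed <- runif(n, 0.1, 0.9) # initial outcome predictions
clever_covariate <- ifelse(a == 1, 1 / propensity_scores, -1 / (1 - propensity_scores))

# Fluctuation model: intercept-free logistic regression of the bounded
# outcome on the clever covariate, with the initial predictions entering
# as a fixed offset on the logit scale
# (binomial() on a non-integer outcome warns; the fit is still usable)
fluctuation_model <- glm(
  y_bounded ~ -1 + clever_covariate + offset(qlogis(initial_pred_observed)),
  family = binomial()
)

epsilon <- coef(fluctuation_model) # single coefficient: size of the shift
```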
@@ -637,7 +649,7 @@ epsilon
 ## TMLE Step 4: Update Predictions
 
 ```{r}
-#| code-line-numbers: "|1-2|5-6"
+#| code-line-numbers: "|2-3|6-7"
 # Update predictions on logit scale, then transform back
 logit_pred_quit <- qlogis(initial_pred_quit) + epsilon * (1 / propensity_scores)
 logit_pred_no_quit <- qlogis(initial_pred_no_quit) + epsilon * (-1 / (1 - propensity_scores))
@@ -710,15 +722,15 @@ targeted_ate <- mean(
   targeted_pred_quit - targeted_pred_no_quit
 ) * (max_y - min_y)
 
-c(initial = initial_ate, targeted = targeted_ate)
+tibble(initial = initial_ate, targeted = targeted_ate)
 ```
 
 ## TMLE Inference
 
 ```{r}
-#| echo: true
+#| output-location: slide
 targeted_pred_observed <- ifelse(
-  nhefs_complete_uc$qsmk == 1,
+  nhefs_complete_uc$qsmk == "Yes",
   targeted_pred_quit,
   targeted_pred_no_quit
 )
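
The hunk stops before the variance calculation. A sketch of the usual influence-curve standard error for the ATE (an assumption about the truncated code, not a quote of it; the names are the objects defined on the preceding slides, and the `(max_y - min_y)` factor undoes the outcome bounding):

```r
# Influence curve for the ATE: clever-covariate-weighted residual plus
# the centered plug-in contrast, mapped back to the original scale
y_range <- max_y - min_y
ic <- clever_covariate * (y_bounded - targeted_pred_observed) * y_range +
  (targeted_pred_quit - targeted_pred_no_quit) * y_range -
  targeted_ate

# Standard error from the empirical variance of the influence curve
se <- sqrt(var(ic) / length(ic))
c(
  estimate = targeted_ate,
  lower_ci = targeted_ate - 1.96 * se,
  upper_ci = targeted_ate + 1.96 * se
)
```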
@@ -743,12 +755,12 @@ tibble(
 
 ```{r}
 #| cache: true
-#| output-location: slide
+#| cache.lazy: false
 library(tmle)
 
 tmle_result <- tmle(
   Y = nhefs_complete_uc$wt82_71,
-  A = nhefs_complete_uc$qsmk |> as.integer(),
+  A = as.integer(nhefs_complete_uc$qsmk == "Yes"),
   W = nhefs_complete_uc |>
     select(sex, race, age, education, smokeintensity,
       smokeyrs, exercise, active, wt71) |>
@@ -757,9 +769,14 @@ tmle_result <- tmle(
   g.SL.library = sl_library
 )
 
-summary(tmle_result)
+tibble(
+  ate = tmle_result$estimates$ATE$psi,
+  lower_ci = tmle_result$estimates$ATE$CI[[1]],
+  upper_ci = tmle_result$estimates$ATE$CI[[2]]
+)
 ```
 
+
 ## *Your Turn 4*
 
 ```{r}
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+tidyverse
+ggplot2
+tibble
+tidyr
+readr
+purrr
+dplyr
+stringr
+forcats
+lubridate
+broom
+causaldata
+touringplans
+propensity
+nnls
+foreach
+gam
+SuperLearner
+Matrix
+glmnet
+tmle
+yardstick
+ggdag
+ggokabeito
+patchwork
3.05 KB · Binary file not shown.
8.18 MB · Binary file not shown.
285 Bytes · Binary file not shown.
3 KB · Binary file not shown.
14.3 MB · Binary file not shown.
272 Bytes · Binary file not shown.
