Commit c667fa9

Merge pull request #58 from r-causal/opening_exercises
2 parents b406246 + d7c7a97 commit c667fa9

File tree: 7 files changed, +844 -67 lines
03-ci-with-group-by-and-summarise-exercises.qmd

Lines changed: 119 additions & 0 deletions

@@ -0,0 +1,119 @@
---
title: "Causal Inference with `group_by()` and `summarize()`"
format: html
---

```{r}
#| label: setup
library(tidyverse)
set.seed(1)
```

## Your Turn 1

Run this code to generate the simulated data set

```{r}
n <- 1000
sim <- tibble(
  confounder = rbinom(n, 1, 0.5),
  p_exposure = case_when(
    confounder == 1 ~ 0.75,
    confounder == 0 ~ 0.25
  ),
  exposure = rbinom(n, 1, p_exposure),
  outcome = confounder + rnorm(n)
)
```
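
This simulation has a built-in confounder: `confounder` makes exposure more likely *and* adds to the outcome, while the exposure itself has no effect. As an optional check (not part of the original exercise), an unadjusted comparison, like the `lm()` calls in the accompanying slides, is therefore biased away from the true effect of zero:

```{r}
# optional check: the confounder drives who gets exposed...
sim |>
  count(confounder, exposure)

# ...so the unadjusted comparison overstates the exposure effect,
# which is truly zero in this simulation
lm(outcome ~ exposure, data = sim)
```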

1. Group the dataset by `confounder` and `exposure`
2. Calculate the mean of the `outcome` for the groups

```{r}
sim |>
  group_by(______, ______) |>
  summarise(avg_y = mean(______)) |>
  # pivot the data so we can get the difference
  # between the exposure groups
  pivot_wider(
    names_from = exposure,
    values_from = avg_y,
    names_prefix = "x_"
  ) |>
  summarise(estimate = x_1 - x_0) |>
  summarise(estimate = mean(estimate)) # note, we would need to weight this if the confounder groups were not equal sized
```
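
If you want to check your work, one way to fill in the blanks (matching the completed code shown in the slides) is:

```{r}
sim |>
  # group by the confounder and the exposure
  group_by(confounder, exposure) |>
  # average outcome within each confounder/exposure cell
  summarise(avg_y = mean(outcome)) |>
  pivot_wider(
    names_from = exposure,
    values_from = avg_y,
    names_prefix = "x_"
  ) |>
  # exposed minus unexposed difference within each confounder level
  summarise(estimate = x_1 - x_0) |>
  # average over confounder levels (equal-sized here, so no weighting needed)
  summarise(estimate = mean(estimate))
```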

## Your Turn 2

Run the following code to generate `sim2`

```{r}
n <- 1000
sim2 <- tibble(
  confounder_1 = rbinom(n, 1, 0.5),
  confounder_2 = rbinom(n, 1, 0.5),

  p_exposure = case_when(
    confounder_1 == 1 & confounder_2 == 1 ~ 0.75,
    confounder_1 == 0 & confounder_2 == 1 ~ 0.9,
    confounder_1 == 1 & confounder_2 == 0 ~ 0.2,
    confounder_1 == 0 & confounder_2 == 0 ~ 0.1
  ),
  exposure = rbinom(n, 1, p_exposure),
  outcome = confounder_1 + confounder_2 + rnorm(n)
)
```

1. Group the dataset by the confounders and exposure
2. Calculate the mean of the outcome for the groups

```{r}
sim2 |>
  group_by(_____, _____, _____) |>
  summarise(avg_y = mean(_____)) |>
  pivot_wider(
    names_from = exposure,
    values_from = avg_y,
    names_prefix = "x_"
  ) |>
  summarise(estimate = x_1 - x_0, .groups = "drop") |>
  summarise(estimate = mean(estimate))
```
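
As before, one possible way to fill in the blanks (mirroring the completed slide code) is:

```{r}
sim2 |>
  # group by both confounders and the exposure
  group_by(confounder_1, confounder_2, exposure) |>
  summarise(avg_y = mean(outcome)) |>
  pivot_wider(
    names_from = exposure,
    values_from = avg_y,
    names_prefix = "x_"
  ) |>
  # difference within each combination of the confounders
  summarise(estimate = x_1 - x_0, .groups = "drop") |>
  # then average over the confounder combinations
  summarise(estimate = mean(estimate))
```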

## Your Turn 3

Run the following code to generate `sim3`

```{r}
n <- 10000
sim3 <- tibble(
  confounder = rnorm(n),
  p_exposure = exp(confounder) / (1 + exp(confounder)),
  exposure = rbinom(n, 1, p_exposure),
  outcome = confounder + rnorm(n)
)
```

1. Use `ntile()` from dplyr to calculate a binned version of `confounder` called `confounder_q`. We'll create a variable with 5 bins.
2. Group the dataset by the binned variable you just created and exposure
3. Calculate the mean of the outcome for the groups

```{r}
sim3 |>
  mutate(confounder_q = _____(_____, 5)) |>
  group_by(_____, _____) |>
  summarise(avg_y = mean(_____)) |>
  pivot_wider(
    names_from = exposure,
    values_from = avg_y,
    names_prefix = "x_"
  ) |>
  summarise(estimate = x_1 - x_0)
```
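
One way to complete the chunk (following the completed code in the slides, which also averages the estimates across the bins) is:

```{r}
sim3 |>
  # bin the continuous confounder into quintiles
  mutate(confounder_q = ntile(confounder, 5)) |>
  group_by(confounder_q, exposure) |>
  summarise(avg_y = mean(outcome)) |>
  pivot_wider(
    names_from = exposure,
    values_from = avg_y,
    names_prefix = "x_"
  ) |>
  summarise(estimate = x_1 - x_0) |>
  # average the within-bin estimates
  summarise(estimate = mean(estimate))
```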

# Takeaways

* Sometimes correlation *is* causation!
* In simple cases, grouping by confounding variables can get us the right answer without a statistical model.
* Propensity scores generalize the idea of summarizing exposure effects to any number of confounders. Although we'll use models for this process, the foundations are the same.
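
The last point is worth a minimal sketch. As an illustration only (not part of the exercises, with object and column names invented for this sketch), a propensity score fit by logistic regression, combined with inverse-probability weights, recovers the same kind of adjusted estimate for `sim3` without binning the confounder by hand:

```{r}
# illustration only: a propensity score version of Your Turn 3
# model the probability of exposure given the confounder
propensity_model <- glm(exposure ~ confounder, data = sim3, family = binomial())

sim3 |>
  mutate(
    ps = predict(propensity_model, type = "response"),
    # inverse probability of treatment weights
    wt = ifelse(exposure == 1, 1 / ps, 1 / (1 - ps))
  ) |>
  group_by(exposure) |>
  summarise(avg_y = weighted.mean(outcome, wt)) |>
  pivot_wider(
    names_from = exposure,
    values_from = avg_y,
    names_prefix = "x_"
  ) |>
  summarise(estimate = x_1 - x_0)
```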

Binary file not shown (146 KB)

slides/raw/03-causal-inference-with-group-by-and-summarise.html

Lines changed: 65 additions & 53 deletions
Large diffs are not rendered by default.

slides/raw/03-causal-inference-with-group-by-and-summarise.qmd

Lines changed: 38 additions & 14 deletions

@@ -124,7 +124,14 @@ sim |>
   summarise(estimate = x_1 - x_0)
 ```

-## Simulation
+## *Your Turn 1* (`03-ci-with-group-by-and-summarise-exercises.qmd`)
+
+### Group the dataset by `confounder` and `exposure`
+### Calculate the mean of the `outcome` for the groups
+
+`r countdown::countdown(minutes = 3)`
+
+## *Your Turn 1*

 ```{r}
 #| code-line-numbers: "|2"
@@ -134,7 +141,7 @@ sim |>
   summarise(avg_y = mean(outcome))
 ```

-## Simulation
+## *Your Turn 1*

 ```{r}
 #| code-line-numbers: "|2"
@@ -147,7 +154,8 @@ sim |>
     values_from = avg_y,
     names_prefix = "x_"
   ) |>
-  summarise(estimate = x_1 - x_0)
+  summarise(estimate = x_1 - x_0) |>
+  summarise(estimate = mean(estimate)) # note, we would need to weight this if the confounder groups were not equal sized
 ```

 . . .
@@ -196,7 +204,12 @@ sim2 |>
 lm(outcome ~ exposure, data = sim2)
 ```

-## Simulation
+## *Your Turn 2*
+
+### Group the dataset by the confounders and exposure
+### Calculate the mean of the outcome for the groups
+
+## *Your Turn 2*

 ```{r}
 #| code-line-numbers: "|2"
@@ -209,10 +222,11 @@ sim2 |>
     values_from = avg_y,
     names_prefix = "x_"
   ) |>
-  summarise(estimate = x_1 - x_0)
+  summarise(estimate = x_1 - x_0, .groups = "drop") |>
+  summarise(estimate = mean(estimate))
 ```

----
+`r countdown::countdown(minutes = 2)`

 ## Simulation

@@ -222,7 +236,7 @@ sim2 |>
 ```{r}
 #| code-line-numbers: "|1"
 n <- 100000
-sim2 <- tibble(
+big_sim2 <- tibble(
   confounder_1 = rbinom(n, 1, 0.5),
   confounder_2 = rbinom(n, 1, 0.5),

@@ -241,7 +255,7 @@ sim2 <- tibble(
 ::: {.column width="50%"}
 ```{r}
 #| echo: false
-sim2 |>
+big_sim2 |>
   select(confounder_1, confounder_2, exposure, outcome)
 ```
 :::
@@ -251,21 +265,22 @@ sim2 |>
 ## Simulation

 ```{r}
-lm(outcome ~ exposure, data = sim2)
+lm(outcome ~ exposure, data = big_sim2)
 ```

 ## Simulation

 ```{r}
 #| code-line-numbers: "|2"
 #| output-location: fragment
-sim2 |>
+big_sim2 |>
   group_by(confounder_1, confounder_2, exposure) |>
   summarise(avg_y = mean(outcome)) |>
   pivot_wider(names_from = exposure,
               values_from = avg_y,
               names_prefix = "x_") |>
-  summarise(estimate = x_1 - x_0)
+  summarise(estimate = x_1 - x_0, .groups = "drop") |>
+  summarise(estimate = mean(estimate))
 ```

@@ -305,10 +320,18 @@ sim3 |>
 lm(outcome ~ exposure, data = sim3)
 ```

-## Simulation
+## *Your Turn 3*
+
+### Use `ntile()` from dplyr to calculate a binned version of `confounder` called `confounder_q`. We'll create a variable with 5 bins.
+### Group the dataset by the binned variable you just created and exposure
+### Calculate the mean of the outcome for the groups
+
+`r countdown::countdown(minutes = 3)`
+
+## *Your Turn 3*

 ```{r}
-#| code-line-numbers: "|2"
+#| code-line-numbers: "|2|3-4"
 #| output-location: fragment
 sim3 |>
   mutate(confounder_q = ntile(confounder, 5)) |>
@@ -319,7 +342,8 @@ sim3 |>
     values_from = avg_y,
     names_prefix = "x_"
   ) |>
-  summarise(estimate = x_1 - x_0)
+  summarise(estimate = x_1 - x_0) |>
+  summarise(estimate = mean(estimate))
 ```

 ## {background-color="#23373B" .center .huge}
