Merge pull request #63 from r-causal/touringplans

malcolmbarrett · web-flow · commit f809dac9c77f · 2023-08-31T15:44:56.000+01:00
diff --git a/exercises/06-intro-pscores-exercises.qmd b/exercises/06-intro-pscores-exercises.qmd
@@ -80,7 +80,7 @@ dagify(
       "\n(one year ago)",
       "\n(6 months ago)",
       "\n(3 months ago)",
-      "5pm - 6pm\n(Today)"
+      "9am - 10am\n(Today)"
     )
   )
 ```
@@ -93,24 +93,24 @@ First we need to subset the data to only include average wait times between 9 an
 
 ```{r}
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 ```
 
 Here's a data dictionary of the variables we need in the `seven_dwarfs` data set:
 
 | Variable                       | Column in `seven_dwarfs` |
 |--------------------------------|--------------------------|
-| Posted Wait Time (outcome)     | `avg_spostmin`           |
-| Extra Magic Morning (exposure) | `extra_magic_morning`    |
-| Ticket Season                  | `wdw_ticket_season`      |
-| Closing Time                   | `close`                  |
-| Historic Temperature           | `weather_wdwhigh`        |
+| Posted Wait Time (outcome)     | `wait_minutes_posted_avg`|
+| Extra Magic Morning (exposure) | `park_extra_magic_morning`    |
+| Ticket Season                  | `park_ticket_season`      |
+| Closing Time                   | `park_close`                  |
+| Historic Temperature           | `park_temperature_high`        |
 
 ## Your Turn
 
 *After updating the code chunks below, change `eval: true` before rendering*
 
-Now, fit a propensity score model for `extra_magic_morning` using the above proposed confounders.
+Now, fit a propensity score model for `park_extra_magic_morning` using the above proposed confounders.
 
 ```{r}
 #| eval: false
@@ -131,7 +131,7 @@ df <- propensity_model |>
 
 Stretch Goal 1:
 
-Examine two histograms of the propensity scores, one days with Extra Magic Morning (`extra_magic_morning == 1`) and one for days without it (`extra_magic_morning == 0`).
+Examine two histograms of the propensity scores, one days with Extra Magic Morning (`park_extra_magic_morning == 1`) and one for days without it (`park_extra_magic_morning == 0`).
 How do these compare?
 
 ```{r}
diff --git a/exercises/07-pscores-using-exercises.qmd b/exercises/07-pscores-using-exercises.qmd
@@ -18,10 +18,10 @@ Below is the propensity score model you created in the previous exercise.
 
 ```{r}
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 
 propensity_model <- glm(
-  extra_magic_morning ~ wdw_ticket_season + close + weather_wdwhigh,
+  park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high,
   data = seven_dwarfs,
   family = binomial()
 )
@@ -82,7 +82,7 @@ Update the code below to examine the distribution of the weighted sample. **HINT
 #| warning: false
 ggplot(
   seven_dwarfs_prop, 
-  aes(.fitted, fill = factor(extra_magic_morning))
+  aes(.fitted, fill = factor(park_extra_magic_morning))
 ) +
   geom_mirror_histogram(bins = 50, alpha = .5) +
   geom_mirror_histogram(aes(weight = ____), alpha = .5, bins = 50) +
diff --git a/exercises/08-pscores-diagnostics-exercises.qmd b/exercises/08-pscores-diagnostics-exercises.qmd
@@ -19,17 +19,17 @@ Below is the propensity score model and weights you created in the previous exer
 
 ```{r}
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 
 propensity_model <- glm(
-  extra_magic_morning ~ wdw_ticket_season + close + weather_wdwhigh,
+  park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high,
   data = seven_dwarfs,
   family = binomial()
 )
 
 seven_dwarfs_ps <- propensity_model |>
   augment(type.predict = "response", data = seven_dwarfs) |>
-  mutate(w_ate = wt_ate(.fitted, extra_magic_morning))
+  mutate(w_ate = wt_ate(.fitted, park_extra_magic_morning))
 ```
 
 ## Your Turn 1
@@ -41,7 +41,7 @@ Calculate the standardized mean differences with and without weights
 ```{r}
 #| eval: false
 smds <- seven_dwarfs_ps |>
-  mutate(close = as.numeric(close)) |>
+  mutate(park_close = as.numeric(park_close)) |>
   tidy_smd(
   .vars = ____,
   .group = ____,
@@ -62,7 +62,7 @@ ggplot(
 
 ## Your Turn 2
 
-Create an unweighted ECDF for `weather_wdwhigh` by whether or not the day had Extra Magic Hours.
+Create an unweighted ECDF for `park_temperature_high` by whether or not the day had Extra Magic Hours.
 
 ```{r}
 #| eval: false
@@ -77,7 +77,7 @@ ggplot(seven_dwarfs_ps, aes(x = ____, group = ____, color = factor(____))) +
   ylab("Proportion <= x") 
 ```
 
-Create an weighted ECDF for `weather_wdwhigh` by whether or not the day had Extra Magic Hours.
+Create an weighted ECDF for `park_temperature_high` by whether or not the day had Extra Magic Hours.
 
 ```{r}
 #| eval: false
diff --git a/exercises/09-outcome-model-exercises.qmd b/exercises/09-outcome-model-exercises.qmd
@@ -13,7 +13,7 @@ library(rsample)
 library(propensity)
 
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 ```
 
 We are interested in examining the relationship between whether there were "Extra Magic Hours" in the morning (the **exposure**) and the average wait time for the Seven Dwarfs Mine Train the same day between 9am and 10am (the **outcome**).
@@ -57,9 +57,9 @@ ipw_results |>
   mutate(
     estimate = map_dbl(
       boot_fits,
-      # pull the `estimate` for `extra_magic_morning` for each fit
+      # pull the `estimate` for `park_extra_magic_morning` for each fit
       \(.fit) .fit |>
-        filter(term == "extra_magic_morning") |>
+        filter(term == "park_extra_magic_morning") |>
         pull(estimate)
     )
   ) |>
diff --git a/exercises/10-continuous-g-computation-exercises.qmd b/exercises/10-continuous-g-computation-exercises.qmd
@@ -13,7 +13,7 @@ library(splines)
 
 For this set of exercises, we'll use g-computation to calculate a causal effect for continuous exposures.
 
-In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`avg_spostmin`) for the Seven Dwarves Mine Train at 8 am affect actual wait times (`avg_sactmin`) at 9 am? Here’s our DAG:
+In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`wait_minutes_posted_avg`) for the Seven Dwarves Mine Train at 8 am affect actual wait times (`wait_minutes_actual_avg`) at 9 am? Here’s our DAG:
 
 ```{r}
 #| echo: false
@@ -83,29 +83,29 @@ dagify(
   )
 ```
 
-First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual weight times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `avg_sactmin`, so we’ll drop unobserved values for now.
+First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual weight times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `wait_minutes_actual_avg`, so we’ll drop unobserved values for now.
 
 You don't need to update any code here, so just run this.
 
 ```{r}
 eight <- seven_dwarfs_train_2018 |>
-  filter(hour == 8) |>
-  select(-avg_sactmin)
+  filter(wait_hour == 8) |>
+  select(-wait_minutes_actual_avg)
 
 nine <- seven_dwarfs_train_2018 |>
-  filter(hour == 9) |>
-  select(date, avg_sactmin)
+  filter(wait_hour == 9) |>
+  select(park_date, wait_minutes_actual_avg)
 
 wait_times <- eight |>
-  left_join(nine, by = "date") |>
-  drop_na(avg_sactmin)
+  left_join(nine, by = "park_date") |>
+  drop_na(wait_minutes_actual_avg)
 ```
 
 # Your Turn 1
 
-For the parametric G-formula, we'll use a single model to fit a causal model of Posted Waiting Times (`avg_spostmin`) on Actual Waiting Times (`avg_sactmin`) where we  include all covariates, much as we normally fit regression models. However, instead of interpreting the coefficients, we'll calculate the estimate by predicting on cloned data sets.
+For the parametric G-formula, we'll use a single model to fit a causal model of Posted Waiting Times (`wait_minutes_posted_avg`) on Actual Waiting Times (`wait_minutes_actual_avg`) where we  include all covariates, much as we normally fit regression models. However, instead of interpreting the coefficients, we'll calculate the estimate by predicting on cloned data sets.
 
-Two additional differences in our model: we'll use a natural cubic spline on the exposure, `avg_spostmin`, using `ns()` from the splines package, and we'll include an interaction term between `avg_spostmin` and `extra_magic_mornin g`. These complicate the interpretation of the coefficient of the model in normal regression but have virtually no downside (as long as we have a reasonable sample size) in g-computation, because we still get an easily interpretable result.
+Two additional differences in our model: we'll use a natural cubic spline on the exposure, `wait_minutes_posted_avg`, using `ns()` from the splines package, and we'll include an interaction term between `wait_minutes_posted_avg` and `park_extra_magic_morning`. These complicate the interpretation of the coefficient of the model in normal regression but have virtually no downside (as long as we have a reasonable sample size) in g-computation, because we still get an easily interpretable result.
 
 First, let's fit the model. 
 
@@ -114,14 +114,14 @@ First, let's fit the model.
 
 ```{r}
 _______ ___ _______(
-  avg_sactmin ~ ns(_______, df = 5)*extra_magic_morning + _______ + _______ + _______, 
+  wait_minutes_actual_avg ~ ns(_______, df = 5)*park_extra_magic_morning + _______ + _______ + _______, 
   data = seven_dwarfs
 )
 ```
 
 # Your Turn 2
 
-Now that we've fit a model, we need to clone our data set. To do this, we'll simply mutate it so that in one set, all participants have `avg_spostmin` set to 30 minutes and in another, all participants have `avg_spostmin` set to 60 minutes. 
+Now that we've fit a model, we need to clone our data set. To do this, we'll simply mutate it so that in one set, all participants have `wait_minutes_posted_avg` set to 30 minutes and in another, all participants have `wait_minutes_posted_avg` set to 60 minutes. 
 
 1. Create the cloned data sets, called `thirty` and `sixty`.
 2. For both data sets, use `standardized_model` and `augment()` to get the predicted values. Use the `newdata` argument in `augment()` with the relevant cloned data set. Then, select only the fitted value. Rename `.fitted` to either `thirty_posted_minutes` or `sixty_posted_minutes` (use the pattern `select(new_name = old_name)`).
diff --git a/exercises/14-bonus-continuous-pscores-exercises.qmd b/exercises/14-bonus-continuous-pscores-exercises.qmd
@@ -13,7 +13,7 @@ library(propensity)
 
 For this set of exercises, we'll use propensity scores for continuous exposures.
 
-In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`avg_spostmin`) for the Seven Dwarves Mine Train at 8 am affect actual wait times (`avg_sactmin`) at 9 am? Here’s our DAG:
+In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`wait_minutes_posted_avg`) for the Seven Dwarves Mine Train at 8 am affect actual wait times (`wait_minutes_actual_avg`) at 9 am? Here’s our DAG:
 
 ```{r}
 #| echo: false
@@ -83,31 +83,31 @@ dagify(
   )
 ```
 
-First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual weight times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `avg_sactmin`, so we’ll drop unobserved values for now.
+First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual weight times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `wait_minutes_actual_avg`, so we’ll drop unobserved values for now.
 
 You don't need to update any code here, so just run this.
 
 ```{r}
 eight <- seven_dwarfs_train_2018 |>
-  filter(hour == 8) |>
-  select(-avg_sactmin)
+  filter(wait_hour == 8) |>
+  select(-wait_minutes_actual_avg)
 
 nine <- seven_dwarfs_train_2018 |>
-  filter(hour == 9) |>
-  select(date, avg_sactmin)
+  filter(wait_hour == 9) |>
+  select(park_date, wait_minutes_actual_avg)
 
 wait_times <- eight |>
-  left_join(nine, by = "date") |>
-  drop_na(avg_sactmin)
+  left_join(nine, by = "park_date") |>
+  drop_na(wait_minutes_actual_avg)
 ```
 
 # Your Turn 1
 
-First, let’s calculate the propensity score model, which will be the denominator in our stabilized weights (more to come on that soon). We’ll fit a model using `lm()` for `avg_spostmin` with our covariates, then use the fitted predictions of `avg_spostmin` (`.fitted`, `.sigma`) to calculate the density using `dnorm()`.
+First, let’s calculate the propensity score model, which will be the denominator in our stabilized weights (more to come on that soon). We’ll fit a model using `lm()` for `wait_minutes_posted_avg` with our covariates, then use the fitted predictions of `wait_minutes_posted_avg` (`.fitted`, `.sigma`) to calculate the density using `dnorm()`.
 
-1. Fit a model using `lm()` with `avg_spostmin` as the outcome and the confounders identified in the DAG.
+1. Fit a model using `lm()` with `wait_minutes_posted_avg` as the outcome and the confounders identified in the DAG.
 2. Use `augment()` to add model predictions to the data frame.
-3. In `wt_ate()`, calculate the weights using `avg_postmin`, `.fitted`, and `.sigma`.
+3. In `wt_ate()`, calculate the weights using `wait_minutes_posted_avg`, `.fitted`, and `.sigma`.
 
 ```{r}
 post_time_model <- lm(
@@ -169,7 +169,7 @@ Now, let's fit the outcome model!
 ```{r}
 lm(___ ~ ___, weights = ___, data = wait_times_swts) |>
   tidy() |>
-  filter(term == "avg_spostmin") |>
+  filter(term == "wait_minutes_posted_avg") |>
   mutate(estimate = estimate * 10)
 ```
 
diff --git a/slides/raw/06-pscores.qmd b/slides/raw/06-pscores.qmd
@@ -181,7 +181,7 @@ dagify(
       "\n(one year ago)",
       "\n(6 months ago)",
       "\n(3 months ago)",
-      "5pm - 6pm\n(Today)"
+      "9am - 10am\n(Today)"
     )
   )
 ```
@@ -190,7 +190,7 @@ dagify(
 
 `r countdown::countdown(minutes = 10)`
 
-### Using the **confounders** identified in the previous DAG, fit a propensity score model for `extra_magic_morning`
+### Using the **confounders** identified in the previous DAG, fit a propensity score model for `park_extra_magic_morning`
 ### *Stretch*: Create two histograms, one of the propensity scores for days with extra morning magic hours and one for those without
 
 
diff --git a/slides/raw/08-pscore-diagnostics.qmd b/slides/raw/08-pscore-diagnostics.qmd
@@ -178,8 +178,8 @@ ggplot(df, aes(x = wt71, color = factor(qsmk))) +
 
 `r countdown::countdown(minutes = 10)`
 
-### Create an unweighted ECDF examining the `weather_wdwhigh` confounder by whether or not the day had Extra Magic Hours.
-### Create a weighted ECDF examining the `weather_wdwhigh` confounder
+### Create an unweighted ECDF examining the `park_temperature_high` confounder by whether or not the day had Extra Magic Hours.
+### Create a weighted ECDF examining the `park_temperature_high` confounder
 
 
 ## {background-color="#23373B" .center .huge}
diff --git a/slides/raw/09-outcome-model.qmd b/slides/raw/09-outcome-model.qmd
@@ -143,7 +143,7 @@ boot_estimate <- int_t(ipw_results, boot_fits) |>
 
 `r countdown::countdown(minutes = 12)`
 
-### Create a function called `ipw_fit` that fits the propensity score model and the weighted outcome model for the effect between `extra_magic_morning` and `avg_spostmin`
+### Create a function called `ipw_fit` that fits the propensity score model and the weighted outcome model for the effect between `park_extra_magic_morning` and `wait_minutes_posted_avg`
 
 ### Using the `bootstraps()` and `int_t()` functions to estimate the final effect.
 
diff --git a/slides/raw/14-bonus-continuous-pscores.qmd b/slides/raw/14-bonus-continuous-pscores.qmd
@@ -192,9 +192,9 @@ dagify(
 
 ## *Your Turn 1*
 
-### Fit a model using `lm()` with `avg_spostmin` as the outcome and the confounders identified in the DAG.
+### Fit a model using `lm()` with `wait_minutes_posted_avg` as the outcome and the confounders identified in the DAG.
 ### Use `augment()` to add model predictions to the data frame
-### In `wt_ate()`, calculate the weights using `avg_postmin`, `.fitted`, and `.sigma`
+### In `wt_ate()`, calculate the weights using `wait_minutes_posted_avg`, `.fitted`, and `.sigma`
 
 `r countdown::countdown(minutes = 5)`
 
@@ -203,23 +203,23 @@ dagify(
 ```{r}
 #| include: false
 eight <- seven_dwarfs_train_2018 |>
-  filter(hour == 8) |>
-  select(-avg_sactmin)
+  filter(wait_hour == 8) |>
+  select(-wait_minutes_posted_avg)
 
 nine <- seven_dwarfs_train_2018 |>
-  filter(hour == 9) |>
-  select(date, avg_sactmin)
+  filter(wait_hour == 9) |>
+  select(park_date, wait_minutes_posted_avg)
 
 wait_times <- eight |>
-  left_join(nine, by = "date") |>
-  drop_na(avg_sactmin)
+  left_join(nine, by = "park_date") |>
+  drop_na(wait_minutes_posted_avg)
 ```
 
 ```{r}
 post_time_model <- lm(
-  avg_spostmin ~
-    close + extra_magic_morning + 
-    weather_wdwhigh + wdw_ticket_season, 
+  wait_minutes_posted_avg ~
+    park_close + park_extra_magic_morning + 
+    park_temperature_high + park_ticket_season, 
   data = wait_times
 )
 ```
@@ -230,7 +230,7 @@ post_time_model <- lm(
 wait_times_wts <- post_time_model |>
   augment(data = wait_times) |>
   mutate(wts = wt_ate(
-    avg_spostmin, .fitted, .sigma = .sigma
+    wait_minutes_posted_avg, .fitted, .sigma = .sigma
   ))
 ```
 
@@ -289,7 +289,7 @@ ggplot(nhefs_swts, aes(swts)) +
 wait_times_swts <- post_time_model |>
   augment(data = wait_times) |>
   mutate(swts = wt_ate(
-    avg_spostmin, 
+    wait_minutes_posted_avg, 
     .fitted,
     .sigma = .sigma,
     stabilize = TRUE
@@ -324,12 +324,12 @@ lm(
 
 ```{r}
 lm(
-  avg_sactmin ~ avg_spostmin, 
+  wait_minutes_actual_avg ~ wait_minutes_posted_avg, 
   weights = swts, 
   data = wait_times_swts
 ) |>
   tidy() |>
-  filter(term == "avg_spostmin") |>
+  filter(term == "wait_minutes_posted_avg") |>
   mutate(estimate = estimate * 10)
 ```
 

Original file line number	Diff line number	Diff line change
`@@ -181,7 +181,7 @@ dagify(`
`181`	`181`	`"\n(one year ago)",`
`182`	`182`	`"\n(6 months ago)",`
`183`	`183`	`"\n(3 months ago)",`
`184`		`- "5pm - 6pm\n(Today)"`
	`184`	`+ "9am - 10am\n(Today)"`
`185`	`185`	`)`
`186`	`186`	`)`
`187`	`187`	```
`@@ -190,7 +190,7 @@ dagify(`
`190`	`190`
`191`	`191`	`r countdown::countdown(minutes = 10)`
`192`	`192`
`193`		-### Using the confounders identified in the previous DAG, fit a propensity score model for `extra_magic_morning`
	`193`	+### Using the confounders identified in the previous DAG, fit a propensity score model for `park_extra_magic_morning`
`194`	`194`	`### Stretch: Create two histograms, one of the propensity scores for days with extra morning magic hours and one for those without`
`195`	`195`
`196`	`196`