add merge and reshape

ljwright · ljwright · commit 01c132a54678 · 2024-05-14T09:19:51.000+01:00
diff --git a/docs/mcs-household_grid.md b/docs/mcs-household_grid.md
@@ -199,21 +199,21 @@ df_0y_hhgrid_prel <- read_dta("0y/mcs1_hhgrid.dta") %>%
   select(MCSID, APNUM00, matches("AHPREL[A-Z]0"))
 
 df_0y_hhgrid_prel %>%
-  select(MCSID, APNUM00, AHPRELA0, AHPRELB0, AHPRELC0, AHPRELD0) %>%
-  filter(MCSID == "M10001N") # To look at just one family
+  filter(MCSID == "M10001N") %>% # To look at just one family
+  select(APNUM00, AHPRELA0, AHPRELB0, AHPRELC0)
 ```
 
 ``` text
-# A tibble: 7 × 6
-  MCSID   APNUM00 AHPRELA0                  AHPRELB0           AHPRELC0 AHPRELD0
-  <chr>     <dbl> <dbl+lbl>                 <dbl+lbl>          <dbl+lb> <dbl+lb>
-1 M10001N       1 96 [Self]                  1 [Husband/Wife]   7 [Nat…  7 [Nat…
-2 M10001N       2  1 [Husband/Wife]         96 [Self]           7 [Nat…  7 [Nat…
-3 M10001N       3  3 [Natural son/daughter]  3 [Natural son/d… 96 [Sel… 11 [Nat…
-4 M10001N       4  3 [Natural son/daughter]  3 [Natural son/d… 11 [Nat… 96 [Sel…
-5 M10001N       5  3 [Natural son/daughter]  3 [Natural son/d… 11 [Nat… 11 [Nat…
-6 M10001N       6  3 [Natural son/daughter]  3 [Natural son/d… 11 [Nat… 11 [Nat…
-7 M10001N     100  3 [Natural son/daughter]  3 [Natural son/d… 11 [Nat… 11 [Nat…
+# A tibble: 7 × 4
+  APNUM00 AHPRELA0                  AHPRELB0                  AHPRELC0          
+    <dbl> <dbl+lbl>                 <dbl+lbl>                 <dbl+lbl>         
+1       1 96 [Self]                  1 [Husband/Wife]          7 [Natural paren…
+2       2  1 [Husband/Wife]         96 [Self]                  7 [Natural paren…
+3       3  3 [Natural son/daughter]  3 [Natural son/daughter] 96 [Self]         
+4       4  3 [Natural son/daughter]  3 [Natural son/daughter] 11 [Natural broth…
+5       5  3 [Natural son/daughter]  3 [Natural son/daughter] 11 [Natural broth…
+6       6  3 [Natural son/daughter]  3 [Natural son/daughter] 11 [Natural broth…
+7     100  3 [Natural son/daughter]  3 [Natural son/daughter] 11 [Natural broth…
 ```
 
 There are seven members in this family, one of whom is a cohort member
@@ -238,23 +238,23 @@ df_0y_hhgrid_prel %>%
                values_to = "relationship") %>%
   mutate(APNUM00_alt = match(str_sub(alt, 7, 7), LETTERS)) %>%
   filter(relationship == 1) %>%
-  select(MCSID, APNUM00, parent_pnum = APNUM00_alt)
+  select(MCSID, APNUM00, partner_pnum = APNUM00_alt)
 ```
 
 ``` text
 # A tibble: 23,616 × 3
-   MCSID   APNUM00 parent_pnum
-   <chr>     <dbl>       <int>
- 1 M10001N       1           2
- 2 M10001N       2           1
- 3 M10002P       1           2
- 4 M10002P       2           1
- 5 M10007U       1           2
- 6 M10007U       2           1
- 7 M10011Q       1           2
- 8 M10011Q       2           1
- 9 M10015U       1           2
-10 M10015U       2           1
+   MCSID   APNUM00 partner_pnum
+   <chr>     <dbl>        <int>
+ 1 M10001N       1            2
+ 2 M10001N       2            1
+ 3 M10002P       1            2
+ 4 M10002P       2            1
+ 5 M10007U       1            2
+ 6 M10007U       2            1
+ 7 M10011Q       1            2
+ 8 M10011Q       2            1
+ 9 M10015U       1            2
+10 M10015U       2            1
 # ℹ 23,606 more rows
 ```
 
diff --git a/quarto/mcs-household_grid.qmd b/quarto/mcs-household_grid.qmd
@@ -84,8 +84,8 @@ df_0y_hhgrid_prel <- read_dta("0y/mcs1_hhgrid.dta") %>%
   select(MCSID, APNUM00, matches("AHPREL[A-Z]0"))
 
 df_0y_hhgrid_prel %>%
-  select(MCSID, APNUM00, AHPRELA0, AHPRELB0, AHPRELC0, AHPRELD0) %>%
-  filter(MCSID == "M10001N") # To look at just one family
+  filter(MCSID == "M10001N") %>% # To look at just one family
+  select(APNUM00, AHPRELA0, AHPRELB0, AHPRELC0)
 ```
 
 There are seven members in this family, one of whom is a cohort member (`APNUM00 == 100`). `APNUM00`'s 1 and 2 are the (natural) parents, and `APNUM00`'s 3-6 and 100 are the (natural) children. The relationship variables show that `APNUM00`'s 1 and 2 are married, and `APNUM00`'s 3-6 and 100 are siblings. Note the symmetry in the relationships: where `APNUM00 == 1`, `AHPRELC0 == 7 [Natural Parent]`, and where `APNUM00 == 3`, `AHPRELA0 == 3 [Natural son/daughter]`.
@@ -99,7 +99,7 @@ df_0y_hhgrid_prel %>%
                values_to = "relationship") %>%
   mutate(APNUM00_alt = match(str_sub(alt, 7, 7), LETTERS)) %>%
   filter(relationship == 1) %>%
-  select(MCSID, APNUM00, parent_pnum = APNUM00_alt)
+  select(MCSID, APNUM00, partner_pnum = APNUM00_alt)
 ```
 
 # Coda
diff --git a/quarto/mcs-reshape_long_wide.qmd b/quarto/mcs-reshape_long_wide.qmd
@@ -0,0 +1,105 @@
+---
+layout: default
+title: "Reshaping Data from Long to Wide (or Wide to Long)"
+nav_order: 4
+parent: MCS
+format: docusaurus-md
+---
+
+# Introduction
+
+In this tutorial, we will learn how to reshape data from long to wide (and vice versa) using the `tidyverse` package in `R`. We will use data on cohort members' height and weight collected in Sweeps 2-7 to demonstrate the process.
+
+```{r}
+#| warning: false
+# Load Packages
+library(tidyverse) # For data manipulation
+library(haven) # For importing .dta files
+library(glue) # For creating strings
+```
+
+```{r}
+#| include: false
+# setwd(Sys.getenv("mcs_fld"))
+```
+
+# Reshaping from Wide to Long
+
+We begin by loading the data from each sweep and merging these together into a single wide format data frame; see [Combining Data Across Sweeps](https://cls-data.github.io/docs/mcs-merging_across_sweeps.html) for more details. Note, the names of the height and weight variables in Sweep 5 (`ECHTCMA0` and `ECWTCMA0`) diverge slightly from the rubric used for other sweeps (`[A-G]CHTCM00` and `[A-G]CWTCM00` where `[A-G]` denotes sweep), hence the need for the complex regular expression in the `read_dta(col_select = ...)` function call. To simplify the names of the columns in the wide dataset, we rename the Sweep 5 variables so they follow the rubric for Sweeps 2-4 and 6-7.
+
+```{r}
+# Follow-up time for sweeps 1-7 (matches the `{fup}y` data folder names);
+# indexed by sweep number
+fups <- c(0, 3, 5, 7, 11, 14, 17)
+
+# Load the identifiers plus the height and weight variables from a single
+# sweep's cohort member interview file.
+#   sweep: sweep number (an index into `fups`)
+# Returns the data frame read by `read_dta()`, with the cohort member number
+# column renamed to `cnum` so it is consistent across sweeps.
+load_height_wide <- function(sweep) {
+  fup <- fups[sweep]
+
+  glue("{fup}y/mcs{sweep}_cm_interview.dta") %>%
+    # Match height (CHTCM) *and* weight (CWTCM); the `(A|0)0` ending covers
+    # the differently-named Sweep 5 variables
+    read_dta(col_select = c("MCSID", matches("^.(CNUM00|C(H|W)TCM(A|0)0)"))) %>%
+    rename(cnum = matches("CNUM00"))
+}
+
+df_wide <- map(2:7, load_height_wide) %>%
+  reduce(~ full_join(.x, .y, by = c("MCSID", "cnum"))) %>%
+  # Bring the Sweep 5 names in line with the rubric used by the other sweeps
+  rename(ECHTCM00 = ECHTCMA0, ECWTCM00 = ECWTCMA0)
+
+str(df_wide)
+```
+
+`df_wide` has 14 columns. Besides the identifiers, `MCSID` and `cnum`, there are 12 columns for height and weight measurements at each sweep. Each of these 12 columns is prefixed by a single letter indicating the sweep. We can reshape the dataset into long format (one row per person x sweep combination) using the `pivot_longer()` function so that the resulting data frame has five columns: two person identifiers, a variable for sweep, and variables for height and weight. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.)(.*)"`, which breaks the column name into two pieces: the first character (`"(.)"`) and the rest of the name (`"(.*)"`). As noted, the first character holds information on sweep. In `names_to`, `.value` is a placeholder for the second piece of the column name.
+
+```{r}
+# One row per cohort member x sweep: the leading letter of each column name
+# goes to `sweep`, the rest of the name becomes the value column
+df_long <- df_wide %>%
+  pivot_longer(
+    cols = matches("C(H|W)TCM00"),
+    names_to = c("sweep", ".value"),
+    names_pattern = "(.)(.*)"
+  )
+
+df_long
+```
+
+# Reshaping from Long to Wide
+We can also reshape the data from long to wide format using the `pivot_wider()` function. In this case, we want to create two new columns for each sweep: one for height and one for weight. We specify the columns to be reshaped using the `values_from` argument, provide the column whose values will distinguish the new columns in the `names_from` argument, and use the `names_glue` argument to specify how the new column names are constructed. The `names_glue` argument uses curly braces (`{}`) to reference the `names_from` column (here, `sweep`) and the special `.value` placeholder. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the name of each of those columns.
+
+```{r}
+# Back to wide: one column per sweep x measure, named <sweep><measure>
+df_long %>%
+  pivot_wider(
+    names_from = sweep,
+    values_from = matches("C(W|H)T"),
+    names_glue = "{sweep}{.value}"
+  )
+```
+
+# Reshape a Cleaned Dataset from Long to Wide
+It is likely that you will not just need to reshape raw data, but cleaned data too. In the next two sections we offer advice on naming variables so that they are easy to select and reshape in long or wide formats. First, let's clean the long dataset by converting the `cnum` and `sweep` columns to integers, creating a new column for follow-up time, and creating new `height` and `weight` variables that replace negative values in the raw height and weight data with `NA` (as well as giving these variables more easy-to-understand names).
+
+
+```{r}
+df_long_clean <- df_long %>%
+  mutate(
+    cnum = as.integer(cnum),
+    sweep = match(sweep, LETTERS), # Sweep letter -> sweep number
+    fup = fups[sweep], # Look up follow-up time by sweep number
+    height = ifelse(CHTCM00 > 0, CHTCM00, NA), # Non-positive values -> NA
+    weight = ifelse(CWTCM00 > 0, CWTCM00, NA)
+  ) %>%
+  select(MCSID, cnum, fup, height, weight)
+```
+
+To reshape the clean data from long to wide format, we can use the `pivot_wider()` function as before. Again, we specify the columns to be reshaped using the `values_from` argument (`height` and `weight`), provide the column whose values will distinguish the new columns in the `names_from` argument (`fup`), and use the `names_glue` argument to specify how the new column names are constructed. The `names_glue` argument uses curly braces (`{}`) to reference the `names_from` column and the special `.value` placeholder. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the name of each of those columns.
+
+
+```{r}
+# One column per measure x follow-up time, named <measure>_<fup>y
+df_wide_clean <- df_long_clean %>%
+  pivot_wider(
+    names_from = fup,
+    values_from = c(height, weight),
+    names_glue = "{.value}_{fup}y"
+  )
+
+df_wide_clean
+```
+
+# Reshape a Cleaned Dataset from Wide to Long
+Finally, we can reshape the clean wide dataset back to long format using the `pivot_longer()` function. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.*)_(\\d+)y$"`, which breaks the column name into two pieces: the variable name (`"(.*)"`), and the follow-up time (`"(\\d+)"`, the digits that precede the trailing `y`). We also use the `names_transform` argument to convert the follow-up time to an integer.
+
+```{r}
+# Split each <measure>_<fup>y column name into the measure (kept as a value
+# column) and the follow-up time (stored as an integer in `fup`)
+df_wide_clean %>%
+  pivot_longer(
+    cols = matches("_.*y$"),
+    names_to = c(".value", "fup"),
+    names_pattern = "(.*)_(\\d+)y$",
+    names_transform = list(fup = as.integer)
+  )
+```
diff --git a/quarto/misc-r_primer.qmd b/quarto/misc-r_primer.qmd
@@ -0,0 +1,115 @@
+---
+layout: default
+title: "A Primer on R"
+nav_order: 2
+format: docusaurus-md
+---
+
+# Introduction
+# The `tidyverse`
+# The pipe (`%>%`)
+# `haven::read_dta()` and the `labelled` package
+# Functions for Data Munging
+## select/rename, mutate/summarise, filter, group_by()
+# tidyselect, stringr and Regular Expressions, pick()
+# `glue()`
+# Repeating Yourself: Anonymous functions, across, map, and rename_with
+# Reshaping
+# Mutating and Filtering Joins
+
+
+In this tutorial, we will learn how to reshape data from long to wide (and vice versa) using the `tidyverse` package in `R`. We will use data on cohort members' height and weight collected in Sweeps 2-7 to demonstrate the process.
+
+```{r}
+#| warning: false
+# Load Packages
+library(tidyverse) # For data manipulation
+library(haven) # For importing .dta files
+library(glue) # For creating strings
+```
+
+```{r}
+#| include: false
+# setwd(Sys.getenv("mcs_fld"))
+```
+
+# Reshaping from Wide to Long
+
+We begin by loading the data from each sweep and merging these together into a single wide format data frame; see [Combining Data Across Sweeps](https://cls-data.github.io/docs/mcs-merging_across_sweeps.html) for more details. Note, the names of the height and weight variables in Sweep 5 (`ECHTCMA0` and `ECWTCMA0`) diverge slightly from the rubric used for other sweeps (`[A-G]CHTCM00` and `[A-G]CWTCM00` where `[A-G]` denotes sweep), hence the need for the complex regular expression in the `read_dta(col_select = ...)` function call. To simplify the names of the columns in the wide dataset, we rename the Sweep 5 variables so they follow the rubric for Sweeps 2-4 and 6-7.
+
+```{r}
+# Follow-up time for sweeps 1-7 (matches the `{fup}y` data folder names);
+# indexed by sweep number
+fups <- c(0, 3, 5, 7, 11, 14, 17)
+
+# Load the identifiers plus the height and weight variables from a single
+# sweep's cohort member interview file.
+#   sweep: sweep number (an index into `fups`)
+# Returns the data frame read by `read_dta()`, with the cohort member number
+# column renamed to `cnum` so it is consistent across sweeps.
+load_height_wide <- function(sweep) {
+  fup <- fups[sweep]
+
+  glue("{fup}y/mcs{sweep}_cm_interview.dta") %>%
+    # Match height (CHTCM) *and* weight (CWTCM); the `(A|0)0` ending covers
+    # the differently-named Sweep 5 variables
+    read_dta(col_select = c("MCSID", matches("^.(CNUM00|C(H|W)TCM(A|0)0)"))) %>%
+    rename(cnum = matches("CNUM00"))
+}
+
+df_wide <- map(2:7, load_height_wide) %>%
+  reduce(~ full_join(.x, .y, by = c("MCSID", "cnum"))) %>%
+  # Bring the Sweep 5 names in line with the rubric used by the other sweeps
+  rename(ECHTCM00 = ECHTCMA0, ECWTCM00 = ECWTCMA0)
+
+str(df_wide)
+```
+
+`df_wide` has 14 columns. Besides the identifiers, `MCSID` and `cnum`, there are 12 columns for height and weight measurements at each sweep. Each of these 12 columns is prefixed by a single letter indicating the sweep. We can reshape the dataset into long format (one row per person x sweep combination) using the `pivot_longer()` function so that the resulting data frame has five columns: two person identifiers, a variable for sweep, and variables for height and weight. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.)(.*)"`, which breaks the column name into two pieces: the first character (`"(.)"`) and the rest of the name (`"(.*)"`). As noted, the first character holds information on sweep. In `names_to`, `.value` is a placeholder for the second piece of the column name.
+
+```{r}
+# One row per cohort member x sweep: the leading letter of each column name
+# goes to `sweep`, the rest of the name becomes the value column
+df_long <- df_wide %>%
+  pivot_longer(
+    cols = matches("C(H|W)TCM00"),
+    names_to = c("sweep", ".value"),
+    names_pattern = "(.)(.*)"
+  )
+
+df_long
+```
+
+# Reshaping from Long to Wide
+We can also reshape the data from long to wide format using the `pivot_wider()` function. In this case, we want to create two new columns for each sweep: one for height and one for weight. We specify the columns to be reshaped using the `values_from` argument, provide the column whose values will distinguish the new columns in the `names_from` argument, and use the `names_glue` argument to specify how the new column names are constructed. The `names_glue` argument uses curly braces (`{}`) to reference the `names_from` column (here, `sweep`) and the special `.value` placeholder. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the name of each of those columns.
+
+```{r}
+# Back to wide: one column per sweep x measure, named <sweep><measure>
+df_long %>%
+  pivot_wider(
+    names_from = sweep,
+    values_from = matches("C(W|H)T"),
+    names_glue = "{sweep}{.value}"
+  )
+```
+
+# Reshape a Cleaned Dataset from Long to Wide
+It is likely that you will not just need to reshape raw data, but cleaned data too. In the next two sections we offer advice on naming variables so that they are easy to select and reshape in long or wide formats. First, let's clean the long dataset by converting the `cnum` and `sweep` columns to integers, creating a new column for follow-up time, and creating new `height` and `weight` variables that replace negative values in the raw height and weight data with `NA` (as well as giving these variables more easy-to-understand names).
+
+
+```{r}
+df_long_clean <- df_long %>%
+  mutate(
+    cnum = as.integer(cnum),
+    sweep = match(sweep, LETTERS), # Sweep letter -> sweep number
+    fup = fups[sweep], # Look up follow-up time by sweep number
+    height = ifelse(CHTCM00 > 0, CHTCM00, NA), # Non-positive values -> NA
+    weight = ifelse(CWTCM00 > 0, CWTCM00, NA)
+  ) %>%
+  select(MCSID, cnum, fup, height, weight)
+```
+
+To reshape the clean data from long to wide format, we can use the `pivot_wider()` function as before. Again, we specify the columns to be reshaped using the `values_from` argument (`height` and `weight`), provide the column whose values will distinguish the new columns in the `names_from` argument (`fup`), and use the `names_glue` argument to specify how the new column names are constructed. The `names_glue` argument uses curly braces (`{}`) to reference the `names_from` column and the special `.value` placeholder. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the name of each of those columns.
+
+
+```{r}
+# One column per measure x follow-up time, named <measure>_<fup>y
+df_wide_clean <- df_long_clean %>%
+  pivot_wider(
+    names_from = fup,
+    values_from = c(height, weight),
+    names_glue = "{.value}_{fup}y"
+  )
+
+df_wide_clean
+```
+
+# Reshape a Cleaned Dataset from Wide to Long
+Finally, we can reshape the clean wide dataset back to long format using the `pivot_longer()` function. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.*)_(\\d+)y$"`, which breaks the column name into two pieces: the variable name (`"(.*)"`), and the follow-up time (`"(\\d+)"`, the digits that precede the trailing `y`). We also use the `names_transform` argument to convert the follow-up time to an integer.
+
+```{r}
+# Split each <measure>_<fup>y column name into the measure (kept as a value
+# column) and the follow-up time (stored as an integer in `fup`)
+df_wide_clean %>%
+  pivot_longer(
+    cols = matches("_.*y$"),
+    names_to = c(".value", "fup"),
+    names_pattern = "(.*)_(\\d+)y$",
+    names_transform = list(fup = as.integer)
+  )
+```
diff --git a/scripts/mcs-determining_the_sample.R b/scripts/mcs-determining_the_sample.R
@@ -0,0 +1,3 @@
+# Use code from a previous section and note that you are not left with a row for everyone
+# Use household_response file with uncount and then left_join to get an appropriate sample
+# Use semi_join() to keep rows subject to a criterion
diff --git a/scripts/mcs-merging_within_sweep.R b/scripts/mcs-merging_within_sweep.R
@@ -0,0 +1 @@
+# Use examples from YouTube video.
diff --git a/scripts/mcs-munging_longitudinal_data.R b/scripts/mcs-munging_longitudinal_data.R
@@ -0,0 +1,3 @@
+# In long format, get someone's ethnicity to determine first()
+# Use count() and add_count() to examine unique values
+# Fill in across sweeps for variable available in only one sweep (or mention joins instead)
diff --git a/scripts/mcs-reshape_long_wide.R b/scripts/mcs-reshape_long_wide.R
@@ -0,0 +1,69 @@
+library(tidyverse)
+library(haven)
+library(glue)
+library(labelled)
+
+# Point the session at the MCS data folder named by the `mcs_fld` environment
+# variable; the relative file paths used below are resolved against it.
+# `rm(list = ls())` has been dropped: it wipes the caller's workspace without
+# giving a genuinely clean session (loaded packages and options persist).
+mcs_fld <- Sys.getenv("mcs_fld")
+if (!nzchar(mcs_fld)) {
+  stop("Environment variable `mcs_fld` is not set.", call. = FALSE)
+}
+setwd(mcs_fld)
+
+# 1. ----
+# Follow-up time for sweeps 1-7 (matches the `{fup}y` data folder names);
+# indexed by sweep number
+fups <- c(0, 3, 5, 7, 11, 14, 17)
+
+# Load the identifiers plus the height and weight variables from a single
+# sweep's cohort member interview file.
+#   sweep: sweep number (an index into `fups`)
+# Returns the data frame read by `read_dta()`, with the cohort member number
+# column renamed to `cnum` so it is consistent across sweeps.
+load_height_wide <- function(sweep) {
+  fup <- fups[sweep]
+
+  glue("{fup}y/mcs{sweep}_cm_interview.dta") %>%
+    read_dta(col_select = c("MCSID", matches("^.(CNUM00|C(W|H)TCM(A|0)0)"))) %>%
+    rename(cnum = matches("CNUM00"))
+}
+
+# Join the per-sweep data frames on the family and cohort member identifiers
+df_wide <- map(2:7, load_height_wide) %>%
+  reduce(~ full_join(.x, .y, by = c("MCSID", "cnum")))
+
+# One row per cohort member x sweep: the leading letter of each column name
+# goes to `sweep`, the rest of the name becomes the value column
+df_long <- df_wide %>%
+  pivot_longer(
+    cols = -c(MCSID, cnum),
+    names_to = c("sweep", ".value"),
+    names_pattern = "(.)(.*)"
+  )
+
+# 2. ----
+# Back to wide: one column per sweep x measure, named <sweep><measure>
+df_long %>%
+  pivot_wider(
+    names_from = sweep,
+    values_from = matches("C(W|H)T"),
+    names_glue = "{sweep}{.value}"
+  )
+
+# Combine the two naming variants of each measure (CHTCM00/CHTCMA0 and
+# CWTCM00/CWTCMA0) into single height/weight columns, then widen by
+# follow-up time
+df_long %>%
+  mutate(
+    cnum = as.integer(cnum),
+    sweep = match(sweep, LETTERS), # Sweep letter -> sweep number
+    fup = fups[sweep],
+    height = ifelse(!is.na(CHTCM00), CHTCM00, CHTCMA0),
+    weight = ifelse(!is.na(CWTCM00), CWTCM00, CWTCMA0)
+  ) %>%
+  select(MCSID, cnum, fup, height, weight) %>%
+  pivot_wider(
+    names_from = fup,
+    values_from = c(height, weight),
+    names_glue = "{.value}_{fup}y"
+  )
+
+
+# Clean the long data: integer identifiers, follow-up time, and height/weight
+# with non-positive values set to NA.
+# In this script the Sweep 5 columns are not renamed, so after pivoting their
+# values sit in CHTCMA0/CWTCMA0. Fall back to those columns where the
+# standard ones are missing; otherwise every Sweep 5 measurement becomes NA.
+df_long_clean <- df_long %>%
+  mutate(
+    cnum = as.integer(cnum),
+    sweep = match(sweep, LETTERS), # Sweep letter -> sweep number
+    fup = fups[sweep],
+    height_raw = ifelse(!is.na(CHTCM00), CHTCM00, CHTCMA0),
+    weight_raw = ifelse(!is.na(CWTCM00), CWTCM00, CWTCMA0),
+    height = ifelse(height_raw > 0, height_raw, NA),
+    weight = ifelse(weight_raw > 0, weight_raw, NA)
+  ) %>%
+  select(MCSID, cnum, fup, height, weight)
+  
+# One column per measure x follow-up time, named <measure>_<fup>y
+df_wide_clean <- df_long_clean %>%
+  pivot_wider(
+    names_from = fup,
+    values_from = c(height, weight),
+    names_glue = "{.value}_{fup}y"
+  )
+
+df_wide_clean
+
+
+# Column names were built as "{.value}_{fup}y", so both patterns must include
+# the trailing "y"; without it no column is selected and the follow-up time
+# cannot be extracted.
+df_wide_clean %>%
+  pivot_longer(
+    cols = matches("_\\d+y$"),
+    names_to = c(".value", "fup"),
+    names_pattern = "(.*)_(\\d+)y$",
+    names_transform = list(fup = as.integer)
+  )
+
diff --git a/scripts/misc-introduction_r.R b/scripts/misc-introduction_r.R

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Use code from a previous section and note that you are not left with a row for everyone`
	`2`	`+# Use household_response file with uncount and then left_join to get an appropriate sample`
	`3`	`+# Use semi_join() to keep rows subject to a criterion`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# In long format, get someone's ethnicity to determine first()`
	`2`	`+# Use count() and add_count() to examine unique values`
	`3`	`+# Fill in across sweeps for variable available in only one sweep (or mention joins instead)`