Skip to content

Commit 01c132a

Browse files
committed
add merge and reshape
1 parent 4649fbe commit 01c132a

9 files changed

+334
-28
lines changed

docs/mcs-household_grid.md

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -199,21 +199,21 @@ df_0y_hhgrid_prel <- read_dta("0y/mcs1_hhgrid.dta") %>%
199199
select(MCSID, APNUM00, matches("AHPREL[A-Z]0"))
200200

201201
df_0y_hhgrid_prel %>%
202-
select(MCSID, APNUM00, AHPRELA0, AHPRELB0, AHPRELC0, AHPRELD0) %>%
203-
filter(MCSID == "M10001N") # To look at just one family
202+
filter(MCSID == "M10001N") %>% # To look at just one family
203+
select(APNUM00, AHPRELA0, AHPRELB0, AHPRELC0)
204204
```
205205

206206
``` text
207-
# A tibble: 7 × 6
208-
MCSID APNUM00 AHPRELA0 AHPRELB0 AHPRELC0 AHPRELD0
209-
<chr> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lb> <dbl+lb>
210-
1 M10001N 1 96 [Self] 1 [Husband/Wife] 7 [Nat… 7 [Nat
211-
2 M10001N 2 1 [Husband/Wife] 96 [Self] 7 [Nat… 7 [Nat
212-
3 M10001N 3 3 [Natural son/daughter] 3 [Natural son/d… 96 [Sel… 11 [Nat…
213-
4 M10001N 4 3 [Natural son/daughter] 3 [Natural son/d… 11 [Nat… 96 [Sel
214-
5 M10001N 5 3 [Natural son/daughter] 3 [Natural son/d… 11 [Nat… 11 [Nat
215-
6 M10001N 6 3 [Natural son/daughter] 3 [Natural son/d… 11 [Nat… 11 [Nat
216-
7 M10001N 100 3 [Natural son/daughter] 3 [Natural son/d… 11 [Nat… 11 [Nat
207+
# A tibble: 7 × 4
208+
APNUM00 AHPRELA0 AHPRELB0 AHPRELC0
209+
<dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
210+
1 1 96 [Self] 1 [Husband/Wife] 7 [Natural paren
211+
2 2 1 [Husband/Wife] 96 [Self] 7 [Natural paren
212+
3 3 3 [Natural son/daughter] 3 [Natural son/daughter] 96 [Self]
213+
4 4 3 [Natural son/daughter] 3 [Natural son/daughter] 11 [Natural broth
214+
5 5 3 [Natural son/daughter] 3 [Natural son/daughter] 11 [Natural broth
215+
6 6 3 [Natural son/daughter] 3 [Natural son/daughter] 11 [Natural broth
216+
7 100 3 [Natural son/daughter] 3 [Natural son/daughter] 11 [Natural broth
217217
```
218218

219219
There are seven members in this family, one of whom is a cohort member
@@ -238,23 +238,23 @@ df_0y_hhgrid_prel %>%
238238
values_to = "relationship") %>%
239239
mutate(APNUM00_alt = match(str_sub(alt, 7, 7), LETTERS)) %>%
240240
filter(relationship == 1) %>%
241-
select(MCSID, APNUM00, parent_pnum = APNUM00_alt)
241+
select(MCSID, APNUM00, partner_pnum = APNUM00_alt)
242242
```
243243

244244
``` text
245245
# A tibble: 23,616 × 3
246-
MCSID APNUM00 parent_pnum
247-
<chr> <dbl> <int>
248-
1 M10001N 1 2
249-
2 M10001N 2 1
250-
3 M10002P 1 2
251-
4 M10002P 2 1
252-
5 M10007U 1 2
253-
6 M10007U 2 1
254-
7 M10011Q 1 2
255-
8 M10011Q 2 1
256-
9 M10015U 1 2
257-
10 M10015U 2 1
246+
MCSID APNUM00 partner_pnum
247+
<chr> <dbl> <int>
248+
1 M10001N 1 2
249+
2 M10001N 2 1
250+
3 M10002P 1 2
251+
4 M10002P 2 1
252+
5 M10007U 1 2
253+
6 M10007U 2 1
254+
7 M10011Q 1 2
255+
8 M10011Q 2 1
256+
9 M10015U 1 2
257+
10 M10015U 2 1
258258
# ℹ 23,606 more rows
259259
```
260260

quarto/mcs-household_grid.qmd

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ df_0y_hhgrid_prel <- read_dta("0y/mcs1_hhgrid.dta") %>%
8484
select(MCSID, APNUM00, matches("AHPREL[A-Z]0"))
8585
8686
df_0y_hhgrid_prel %>%
87-
select(MCSID, APNUM00, AHPRELA0, AHPRELB0, AHPRELC0, AHPRELD0) %>%
88-
filter(MCSID == "M10001N") # To look at just one family
87+
filter(MCSID == "M10001N") %>% # To look at just one family
88+
select(APNUM00, AHPRELA0, AHPRELB0, AHPRELC0)
8989
```
9090

9191
There are seven members in this family, one of whom is a cohort member (`APNUM00 == 100`). `APNUM00`'s 1 and 2 are the (natural) parents, and `APNUM00`'s 3-6 and 100 are the (natural) children. The relationship variables show that `APNUM00`'s 1 and 2 are married, and `APNUM00`'s 3-6 and 100 are siblings. Note the symmetry in the relationships: where `APNUM00 == 1`, `AHPRELC0 == 7 [Natural Parent]`, and where `APNUM00 == 3`, `AHPRELA0 == 3 [Natural son/daughter]`.
@@ -99,7 +99,7 @@ df_0y_hhgrid_prel %>%
9999
values_to = "relationship") %>%
100100
mutate(APNUM00_alt = match(str_sub(alt, 7, 7), LETTERS)) %>%
101101
filter(relationship == 1) %>%
102-
select(MCSID, APNUM00, parent_pnum = APNUM00_alt)
102+
select(MCSID, APNUM00, partner_pnum = APNUM00_alt)
103103
```
104104

105105
# Coda

quarto/mcs-reshape_long_wide.qmd

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
---
2+
layout: default
3+
title: "Reshaping Data from Long to Wide (or Wide to Long)"
4+
nav_order: 4
5+
parent: MCS
6+
format: docusaurus-md
7+
---
8+
9+
# Introduction
10+
11+
In this tutorial, we will learn how to reshape data from long to wide (and vice versa) using the `tidyverse` package in `R`. We will use data on cohort members' height and weight collected in Sweeps 2-7 to demonstrate the process.
12+
13+
```{r}
14+
#| warning: false
15+
# Load Packages
16+
library(tidyverse) # For data manipulation
17+
library(haven) # For importing .dta files
18+
library(glue) # For creating strings
19+
```
20+
21+
```{r}
22+
#| include: false
23+
# setwd(Sys.getenv("mcs_fld"))
24+
```
25+
26+
# Reshaping from Wide to Long
27+
28+
We begin by loading the data from each sweep and merging these together into a single wide format data frame; see [Combining Data Across Sweeps](https://cls-data.github.io/docs/mcs-merging_across_sweeps.html) for more details. Note that the names of the height and weight variables in Sweep 5 (`ECHTCMA0` and `ECWTCMA0`) diverge slightly from the rubric used for other sweeps (`[A-G]CHTCM00` and `[A-G]CWTCM00`, where `[A-G]` denotes sweep), hence the need for the complex regular expression in the `read_dta(col_select = ...)` function call. To simplify the names of the columns in the wide dataset, we rename the Sweep 5 variables so they follow the rubric for Sweeps 2-4 and 6-7.
29+
30+
```{r}
31+
# Follow-up time for each sweep, indexed by sweep number (used to build the
# "{fup}y" folder names; presumably years)
fups <- c(0, 3, 5, 7, 11, 14, 17)

# Load the identifiers plus the height and weight variables for one sweep.
#   sweep: sweep number, used as an index into `fups` and in the file name.
# Returns the data frame produced by read_dta(), with the sweep-specific
# *CNUM00 column renamed to `cnum`.
load_height_wide <- function(sweep) {
  fup <- fups[sweep]

  glue("{fup}y/mcs{sweep}_cm_interview.dta") %>%
    # C(H|W)TCM picks up both height and weight; (A|0)0 also catches the
    # Sweep 5 names, which end in A0 rather than 00
    read_dta(col_select = c("MCSID", matches("^.(CNUM00|C(H|W)TCM(A|0)0)"))) %>%
    rename(cnum = matches("CNUM00"))
}

df_wide <- map(2:7, load_height_wide) %>%
  reduce(~ full_join(.x, .y, by = c("MCSID", "cnum"))) %>%
  # Bring the Sweep 5 names in line with the rubric used by the other sweeps
  rename(ECHTCM00 = ECHTCMA0, ECWTCM00 = ECWTCMA0)

str(df_wide)
47+
```
48+
49+
`df_wide` has 14 columns. Besides the identifiers, `MCSID` and `cnum`, there are 12 columns for height and weight measurements at each sweep. Each of these 12 columns is prefixed by a single letter indicating the sweep. We can reshape the dataset into long format (one row per person x sweep combination) using the `pivot_longer()` function so that the resulting data frame has five columns: two person identifiers, a variable for sweep, and variables for height and weight. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.)(.*)"`, which breaks the column name into two pieces: the first character (`"(.)"`) and the rest of the name (`"(.*)"`). As noted, the first character holds information on sweep. In `names_to`, `.value` indicates that the second piece of the column name supplies the name of the new column that will hold the values.
50+
51+
```{r}
52+
# Wide -> long: the first character of each column name becomes `sweep`,
# and the rest of the name becomes the name of the value column
df_long <- pivot_longer(
  df_wide,
  cols = matches("C(H|W)TCM00"),
  names_to = c("sweep", ".value"),
  names_pattern = "(.)(.*)"
)

df_long
58+
```
59+
60+
# Reshaping from Long to Wide
61+
We can also reshape the data from long to wide format using the `pivot_wider()` function. In this case, we want to create two new columns for each sweep: one for height and one for weight. We specify the columns to be reshaped using the `values_from` argument, provide the new column names in the `names_from` argument, and use the `names_glue` argument to specify the new column names. The `names_glue` argument uses curly braces (`{}`) to reference the values from the `names_from` and `.value` arguments. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the variable name.
62+
63+
```{r}
64+
# Long -> wide: one column per sweep x measure, named <sweep><measure>
pivot_wider(
  df_long,
  names_from = sweep,
  values_from = matches("C(W|H)T"),
  names_glue = "{sweep}{.value}"
)
68+
```
69+
70+
# Reshape a Cleaned Dataset from Long to Wide
71+
It is likely that you will not just need to reshape raw data, but cleaned data too. In the next two sections we offer advice on naming variables so that they are easy to select and reshape in long or wide formats. First, let's clean the long dataset by converting the `cnum` and `sweep` columns to integers, creating a new column for follow-up time, and creating new `height` and `weight` variables that replace negative values in the raw height and weight data with `NA` (as well as giving these variables more easy-to-understand names).
72+
73+
74+
```{r}
75+
df_long_clean <- df_long %>%
  # Identifiers: integer person number, sweep letter -> sweep number
  mutate(cnum = as.integer(cnum),
         sweep = match(sweep, LETTERS)) %>%
  # Follow-up time for the sweep; values that are not > 0 become NA
  mutate(fup = fups[sweep],
         height = ifelse(CHTCM00 > 0, CHTCM00, NA),
         weight = ifelse(CWTCM00 > 0, CWTCM00, NA)) %>%
  select(MCSID, cnum, fup, height, weight)
82+
```
83+
84+
To reshape the clean data from long to wide format, we can use the `pivot_wider()` function as before. This time, we specify the columns to be reshaped using the `values_from` argument, provide the new column names in the `names_from` argument, and use the `names_glue` argument to specify the new column names. The `names_glue` argument uses curly braces (`{}`) to reference the values from the `names_from` and `.value` arguments. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the variable name.
85+
86+
87+
```{r}
88+
# Long -> wide with readable names: <measure>_<follow-up>y
df_wide_clean <- pivot_wider(
  df_long_clean,
  names_from = fup,
  values_from = c(height, weight),
  names_glue = "{.value}_{fup}y"
)

df_wide_clean
94+
```
95+
96+
# Reshape a Cleaned Dataset from Wide to Long
97+
Finally, we can reshape the clean wide dataset back to long format using the `pivot_longer()` function. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.*)_(\\d+)y$"`, which breaks the column name into two pieces: the variable name (`"(.*)"`), and the follow-up time (`"(\\d+)"`, the digits before the trailing `y`). We also use the `names_transform` argument to convert the follow-up time to an integer.
98+
99+
```{r}
100+
# Wide -> long: split <measure>_<follow-up>y into the value column name and
# an integer `fup`
pivot_longer(
  df_wide_clean,
  cols = matches("_.*y$"),
  names_to = c(".value", "fup"),
  names_pattern = "(.*)_(\\d+)y$",
  names_transform = list(fup = as.integer)
)
105+
```

quarto/misc-r_primer.qmd

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
---
2+
layout: default
3+
title: "A Primer on R"
4+
nav_order: 2
5+
format: docusaurus-md
6+
---
7+
8+
# Introduction
9+
# The `tidyverse`
10+
# The pipe (`%>%`)
11+
# `haven::read_dta()` and the `labelled` package
12+
# Functions for Data Munging
13+
## select/rename, mutate/summarise, filter, group_by()
14+
# tidyselect, stringr and Regular Expressions, pick()
15+
# `glue()`
16+
# Repeating Yourself: Anonymous functions, across, map, and rename_with
17+
# Reshaping
18+
# Mutating and Filtering Joins
19+
20+
21+
In this tutorial, we will learn how to reshape data from long to wide (and vice versa) using the `tidyverse` package in `R`. We will use data on cohort members' height and weight collected in Sweeps 2-7 to demonstrate the process.
22+
23+
```{r}
24+
#| warning: false
25+
# Load Packages
26+
library(tidyverse) # For data manipulation
27+
library(haven) # For importing .dta files
28+
library(glue) # For creating strings
29+
```
30+
31+
```{r}
32+
#| include: false
33+
# setwd(Sys.getenv("mcs_fld"))
34+
```
35+
36+
# Reshaping from Wide to Long
37+
38+
We begin by loading the data from each sweep and merging these together into a single wide format data frame; see [Combining Data Across Sweeps](https://cls-data.github.io/docs/mcs-merging_across_sweeps.html) for more details. Note that the names of the height and weight variables in Sweep 5 (`ECHTCMA0` and `ECWTCMA0`) diverge slightly from the rubric used for other sweeps (`[A-G]CHTCM00` and `[A-G]CWTCM00`, where `[A-G]` denotes sweep), hence the need for the complex regular expression in the `read_dta(col_select = ...)` function call. To simplify the names of the columns in the wide dataset, we rename the Sweep 5 variables so they follow the rubric for Sweeps 2-4 and 6-7.
39+
40+
```{r}
41+
# Follow-up time for each sweep, indexed by sweep number (used to build the
# "{fup}y" folder names; presumably years)
fups <- c(0, 3, 5, 7, 11, 14, 17)

# Load the identifiers plus the height and weight variables for one sweep.
#   sweep: sweep number, used as an index into `fups` and in the file name.
# Returns the data frame produced by read_dta(), with the sweep-specific
# *CNUM00 column renamed to `cnum`.
load_height_wide <- function(sweep) {
  fup <- fups[sweep]

  glue("{fup}y/mcs{sweep}_cm_interview.dta") %>%
    # C(H|W)TCM picks up both height and weight; (A|0)0 also catches the
    # Sweep 5 names, which end in A0 rather than 00
    read_dta(col_select = c("MCSID", matches("^.(CNUM00|C(H|W)TCM(A|0)0)"))) %>%
    rename(cnum = matches("CNUM00"))
}

df_wide <- map(2:7, load_height_wide) %>%
  reduce(~ full_join(.x, .y, by = c("MCSID", "cnum"))) %>%
  # Bring the Sweep 5 names in line with the rubric used by the other sweeps
  rename(ECHTCM00 = ECHTCMA0, ECWTCM00 = ECWTCMA0)

str(df_wide)
57+
```
58+
59+
`df_wide` has 14 columns. Besides the identifiers, `MCSID` and `cnum`, there are 12 columns for height and weight measurements at each sweep. Each of these 12 columns is prefixed by a single letter indicating the sweep. We can reshape the dataset into long format (one row per person x sweep combination) using the `pivot_longer()` function so that the resulting data frame has five columns: two person identifiers, a variable for sweep, and variables for height and weight. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.)(.*)"`, which breaks the column name into two pieces: the first character (`"(.)"`) and the rest of the name (`"(.*)"`). As noted, the first character holds information on sweep. In `names_to`, `.value` indicates that the second piece of the column name supplies the name of the new column that will hold the values.
60+
61+
```{r}
62+
# Wide -> long: the first character of each column name becomes `sweep`,
# and the rest of the name becomes the name of the value column
df_long <- pivot_longer(
  df_wide,
  cols = matches("C(H|W)TCM00"),
  names_to = c("sweep", ".value"),
  names_pattern = "(.)(.*)"
)

df_long
68+
```
69+
70+
# Reshaping from Long to Wide
71+
We can also reshape the data from long to wide format using the `pivot_wider()` function. In this case, we want to create two new columns for each sweep: one for height and one for weight. We specify the columns to be reshaped using the `values_from` argument, provide the new column names in the `names_from` argument, and use the `names_glue` argument to specify the new column names. The `names_glue` argument uses curly braces (`{}`) to reference the values from the `names_from` and `.value` arguments. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the variable name.
72+
73+
```{r}
74+
# Long -> wide: one column per sweep x measure, named <sweep><measure>
pivot_wider(
  df_long,
  names_from = sweep,
  values_from = matches("C(W|H)T"),
  names_glue = "{sweep}{.value}"
)
78+
```
79+
80+
# Reshape a Cleaned Dataset from Long to Wide
81+
It is likely that you will not just need to reshape raw data, but cleaned data too. In the next two sections we offer advice on naming variables so that they are easy to select and reshape in long or wide formats. First, let's clean the long dataset by converting the `cnum` and `sweep` columns to integers, creating a new column for follow-up time, and creating new `height` and `weight` variables that replace negative values in the raw height and weight data with `NA` (as well as giving these variables more easy-to-understand names).
82+
83+
84+
```{r}
85+
df_long_clean <- df_long %>%
  # Identifiers: integer person number, sweep letter -> sweep number
  mutate(cnum = as.integer(cnum),
         sweep = match(sweep, LETTERS)) %>%
  # Follow-up time for the sweep; values that are not > 0 become NA
  mutate(fup = fups[sweep],
         height = ifelse(CHTCM00 > 0, CHTCM00, NA),
         weight = ifelse(CWTCM00 > 0, CWTCM00, NA)) %>%
  select(MCSID, cnum, fup, height, weight)
92+
```
93+
94+
To reshape the clean data from long to wide format, we can use the `pivot_wider()` function as before. This time, we specify the columns to be reshaped using the `values_from` argument, provide the new column names in the `names_from` argument, and use the `names_glue` argument to specify the new column names. The `names_glue` argument uses curly braces (`{}`) to reference the values from the `names_from` and `.value` arguments. As we are specifying multiple columns in `values_from`, `.value` is a placeholder for the variable name.
95+
96+
97+
```{r}
98+
# Long -> wide with readable names: <measure>_<follow-up>y
df_wide_clean <- pivot_wider(
  df_long_clean,
  names_from = fup,
  values_from = c(height, weight),
  names_glue = "{.value}_{fup}y"
)

df_wide_clean
104+
```
105+
106+
# Reshape a Cleaned Dataset from Wide to Long
107+
Finally, we can reshape the clean wide dataset back to long format using the `pivot_longer()` function. We specify the columns to be reshaped using the `cols` argument, provide the new variable names in the `names_to` argument, and the pattern the existing column names take using the `names_pattern` argument. For `names_pattern` we specify `"(.*)_(\\d+)y$"`, which breaks the column name into two pieces: the variable name (`"(.*)"`), and the follow-up time (`"(\\d+)"`, the digits before the trailing `y`). We also use the `names_transform` argument to convert the follow-up time to an integer.
108+
109+
```{r}
110+
# Wide -> long: split <measure>_<follow-up>y into the value column name and
# an integer `fup`
pivot_longer(
  df_wide_clean,
  cols = matches("_.*y$"),
  names_to = c(".value", "fup"),
  names_pattern = "(.*)_(\\d+)y$",
  names_transform = list(fup = as.integer)
)
115+
```
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Use code from a previous section and note that you are not left with a row for everyone
2+
# Use household_response file with uncount and then left_join to get an appropriate sample
3+
# Use semi_join() to keep rows subject to a criterion

scripts/mcs-merging_within_sweep.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Use examples from YouTube video.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# In long format, get someone's ethnicity to determine first()
2+
# Use count() and add_count() to examine unique values
3+
# Fill in across sweeps for variable available in only one sweep (or mention joins instead)

scripts/mcs-reshape_long_wide.R

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
library(tidyverse)
2+
library(haven)
3+
library(glue)
4+
library(labelled)
5+
6+
# Move to the folder holding the data; every file path below is relative to
# it. The location comes from the `mcs_fld` environment variable so that no
# machine-specific path is hard-coded.
# Run this script in a fresh R session instead of clearing the workspace with
# rm(list = ls()): that call deletes the user's objects yet leaves attached
# packages and options untouched, so it does not give a clean slate.
mcs_fld <- Sys.getenv("mcs_fld")
if (!nzchar(mcs_fld)) {
  stop("Environment variable `mcs_fld` is not set.", call. = FALSE)
}
setwd(mcs_fld)
9+
10+
# 1. ----
11+
# Follow-up time for each sweep, indexed by sweep number (used to build the
# "{fup}y" folder names; presumably years)
fups <- c(0, 3, 5, 7, 11, 14, 17)

# Load the identifiers plus the height and weight variables for one sweep.
#   sweep: sweep number, used as an index into `fups` and in the file name.
# Returns the data frame produced by read_dta(), with the sweep-specific
# *CNUM00 column renamed to `cnum`.
load_height_wide <- function(sweep) {
  fup <- fups[sweep]

  glue("{fup}y/mcs{sweep}_cm_interview.dta") %>%
    # (A|0)0 accepts names ending in either A0 or 00
    read_dta(col_select = c("MCSID", matches("^.(CNUM00|C(W|H)TCM(A|0)0)"))) %>%
    rename(cnum = matches("CNUM00"))
}

# One data frame per sweep (2 to 7), joined on the family and person ids
df_wide <- map(2:7, load_height_wide) %>%
  reduce(~ full_join(.x, .y, by = c("MCSID", "cnum")))
24+
25+
# Wide -> long: the first character of each column name becomes `sweep`,
# and the rest of the name becomes the name of the value column
df_long <- pivot_longer(
  df_wide,
  cols = -c(MCSID, cnum),
  names_to = c("sweep", ".value"),
  names_pattern = "(.)(.*)"
)

# 2. ----
# Long -> wide again, rebuilding the raw <sweep><measure> names
pivot_wider(
  df_long,
  names_from = sweep,
  values_from = matches("C(W|H)T"),
  names_glue = "{sweep}{.value}"
)

# Long -> wide with readable names, taking each measure from the *00 column
# when it is present and from the *A0 column otherwise
df_long %>%
  mutate(cnum = as.integer(cnum),
         sweep = match(sweep, LETTERS)) %>%
  mutate(fup = fups[sweep],
         height = ifelse(!is.na(CHTCM00), CHTCM00, CHTCMA0),
         weight = ifelse(!is.na(CWTCM00), CWTCM00, CWTCMA0)) %>%
  select(MCSID, cnum, fup, height, weight) %>%
  pivot_wider(names_from = fup,
              values_from = c(height, weight),
              names_glue = "{.value}_{fup}y")
46+
47+
48+
# Clean the long data: integer identifiers, follow-up time, and height/weight
# with values that are not > 0 set to NA.
df_long_clean <- df_long %>%
  mutate(cnum = as.integer(cnum),
         sweep = match(sweep, LETTERS),   # sweep letter -> sweep number
         fup = fups[sweep],
         # In this script the *A0 columns are not renamed, so some rows hold
         # their measurement there; fall back to them when *00 is missing,
         # otherwise those measurements would be lost
         height = ifelse(!is.na(CHTCM00), CHTCM00, CHTCMA0),
         weight = ifelse(!is.na(CWTCM00), CWTCM00, CWTCMA0),
         height = ifelse(height > 0, height, NA),
         weight = ifelse(weight > 0, weight, NA)) %>%
  select(MCSID, cnum, fup, height, weight)
55+
56+
# Long -> wide with readable names: <measure>_<follow-up>y
df_wide_clean <- pivot_wider(
  df_long_clean,
  names_from = fup,
  values_from = c(height, weight),
  names_glue = "{.value}_{fup}y"
)

df_wide_clean
62+
63+
64+
# Wide -> long. The wide names are built as "{.value}_{fup}y" (e.g.
# height_3y), so the trailing "y" has to be part of both the column selection
# and the name pattern; without it no column matches.
df_wide_clean %>%
  pivot_longer(cols = matches("_\\d+y$"),
               names_to = c(".value", "fup"),
               names_pattern = "(.*)_(\\d+)y$",
               names_transform = list(fup = as.integer))
69+

0 commit comments

Comments
 (0)