|
| 1 | +--- |
| 2 | +layout: default |
| 3 | +title: Reshaping Data from Long to Wide (or Wide to Long) |
| 4 | +nav_order: 4 |
| 5 | +parent: MCS |
| 6 | +format: docusaurus-md |
| 7 | +--- |
| 8 | + |
| 9 | + |
| 10 | + |
| 11 | + |
| 12 | +# Introduction |
| 13 | + |
| 14 | +In this tutorial, we will learn how to reshape data from long to wide |
| 15 | +(and vice versa) using the `tidyverse` package in `R`. We will use data |
| 16 | +on cohort members’ height and weight collected in Sweeps 2-7 to |
| 17 | +demonstrate the process. |
| 18 | + |
| 19 | +```r |
| 20 | +# Load Packages |
| 21 | +library(tidyverse) # For data manipulation |
| 22 | +library(haven) # For importing .dta files |
| 23 | +library(glue) # For creating strings |
| 24 | +``` |
| 25 | + |
| 26 | +# Reshaping from Wide to Long |
| 27 | + |
| 28 | +We begin by loading the data from each sweep and merging these together |
| 29 | +into a single wide format data frame; see [Combining Data Across |
| 30 | +Sweeps](https://cls-data.github.io/docs/mcs-merging_across_sweeps.html) |
| 31 | +for more details. Note, the names of the height and weight variables in |
| 32 | +Sweep 5 (`ECHTCMA0` and `ECWTCMA0`) diverge slightly from the rubric |
| 33 | +used for other sweeps (`[A-G]CHTCM00` and `[A-G]CWTCM00` where `[A-G]` |
| 34 | +denotes sweep), hence the need for the complex regular expression in the |
| 35 | +`read_dta(col_select = ...)` function call. To simplify the names of the |
| 36 | +columns in the wide dataset, we rename the Sweep 5 variables so they |
| 37 | +follow the rubric for Sweeps 2-4 and 6-7. |
| 38 | + |
| 39 | +```r |
| 40 | +fups <- c(0, 3, 5, 7, 11, 14, 17) # follow-up value for each sweep, indexed by sweep number |
| 41 | + |
| 42 | +load_height_wide <- function(sweep) { |
| 43 | + # Read the ID, cohort-member number and height/weight columns for one sweep |
| 44 | + fup <- fups[sweep] # follow-up value used in the folder name |
| 45 | + |
| 46 | + glue("{fup}y/mcs{sweep}_cm_interview.dta") %>% |
| 47 | + read_dta(col_select = c("MCSID", matches("^.(CNUM00|C(H|W)TCM(A|0)0)"))) %>% |
| 48 | + rename(cnum = matches("CNUM00")) # drop the sweep prefix so cnum can be a join key |
| 49 | +} |
| 50 | + |
| 51 | +df_wide <- map(2:7, load_height_wide) %>% # one data frame per sweep (2-7) |
| 52 | + reduce(~ full_join(.x, .y, by = c("MCSID", "cnum"))) %>% # merge on family + cohort-member IDs |
| 53 | + rename(ECHTCM00 = ECHTCMA0, ECWTCM00 = ECWTCMA0) # give Sweep 5 columns the same name pattern as other sweeps |
| 54 | + |
| 55 | +str(df_wide) |
| 56 | +``` |
| 57 | + |
| 58 | +``` text |
| 59 | +tibble [17,614 × 13] (S3: tbl_df/tbl/data.frame) |
| 60 | + $ MCSID : chr [1:17614] "M10001N" "M10002P" "M10007U" "M10008V" ... |
| 61 | + ..- attr(*, "label")= chr "MCS Research ID - Anonymised Family/Household Identifier" |
| 62 | + ..- attr(*, "format.stata")= chr "%7s" |
| 63 | + $ cnum : dbl+lbl [1:17614] 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... |
| 64 | + ..@ labels: Named num [1:3] 1 2 3 |
| 65 | + .. ..- attr(*, "names")= chr [1:3] "1st Cohort Member of the family" "2nd Cohort Member of the family" "3rd Cohort Member of the family" |
| 66 | + ..@ label : chr "Cohort Member number within an MCS family" |
| 67 | + $ BCHTCM00 : dbl+lbl [1:17614] 97, 96, 102, -2, -2, 106, 97, 94, 102, 99, 9... |
| 68 | + ..@ label : chr "PHYS Child's standing height (cm)" |
| 69 | + ..@ format.stata: chr "%8.0g" |
| 70 | + ..@ labels : Named num [1:2] -2 -1 |
| 71 | + .. ..- attr(*, "names")= chr [1:2] "No Measurement taken" "Not answered / missing" |
| 72 | + $ CCHTCM00 : dbl+lbl [1:17614] 114, 110, 118, NA, NA, 121, NA, 110, 118, 110, 11... |
| 73 | + ..@ label : chr "PHYS: Height in cms" |
| 74 | + ..@ format.stata: chr "%12.0g" |
| 75 | + ..@ labels : Named num [1:5] -9 -8 -1 99998 99999 |
| 76 | + .. ..- attr(*, "names")= chr [1:5] "Refusal" "Don't Know" "Not applicable" "Refusal" ... |
| 77 | + $ CCWTCM00 : dbl+lbl [1:17614] 21.2, 19.2, 25.3, NA, NA, 32.9, NA, 19.7, 23.0... |
| 78 | + ..@ label : chr "PHYS: Weight in Kilograms" |
| 79 | + ..@ format.stata: chr "%12.0g" |
| 80 | + ..@ labels : Named num [1:3] -9 -8 -1 |
| 81 | + .. ..- attr(*, "names")= chr [1:3] "Refusal" "Don't Know" "Not applicable" |
| 82 | + $ DCHTCM00 : dbl+lbl [1:17614] 128, 123, 129, NA, NA, 137, NA, 122, 130, 121, 12... |
| 83 | + ..@ label : chr "Height in cms" |
| 84 | + ..@ format.stata: chr "%12.0g" |
| 85 | + ..@ labels : Named num [1:3] -9 -8 -1 |
| 86 | + .. ..- attr(*, "names")= chr [1:3] "Refusal" "Don''t Know" "Not applicable" |
| 87 | + $ DCWTCM00 : dbl+lbl [1:17614] 25.5, 26.2, 26.5, NA, NA, 51.2, NA, 24.1, 29.0... |
| 88 | + ..@ label : chr "Weight in Kilos" |
| 89 | + ..@ format.stata: chr "%12.0g" |
| 90 | + ..@ labels : Named num [1:3] -9 -8 -1 |
| 91 | + .. ..- attr(*, "names")= chr [1:3] "Refusal" "Don''t Know" "Not applicable" |
| 92 | + $ ECHTCM00 : dbl+lbl [1:17614] NA, 144, 154, NA, NA, 168, NA, 143, 152, NA, 15... |
| 93 | + ..@ label : chr "Height in cms" |
| 94 | + ..@ format.stata: chr "%12.0g" |
| 95 | + ..@ labels : Named num [1:2] -7 -1 |
| 96 | + .. ..- attr(*, "names")= chr [1:2] "No answer" "Not applicable" |
| 97 | + $ ECWTCMA00: dbl+lbl [1:17614] NA, 41.8, 40.6, NA, NA, 74.0, NA, 38.2, 41.5... |
| 98 | + ..@ label : chr "Weight in kilos" |
| 99 | + ..@ format.stata: chr "%12.0g" |
| 100 | + ..@ labels : Named num [1:2] -7 -1 |
| 101 | + .. ..- attr(*, "names")= chr [1:2] "No answer" "Not applicable" |
| 102 | + $ FCHTCM00 : dbl+lbl [1:17614] NA, 163, 174, NA, NA, NA, NA, 164, 167, NA, 16... |
| 103 | + ..@ label : chr "Height in centimeters" |
| 104 | + ..@ format.stata: chr "%12.0g" |
| 105 | + ..@ labels : Named num [1:2] -5 -1 |
| 106 | + .. ..- attr(*, "names")= chr [1:2] "UNABLE TO OBTAIN HEIGHT MEASUREMENT" "Not applicable" |
| 107 | + $ FCWTCM00 : dbl+lbl [1:17614] NA, 52.3, 57.1, NA, NA, NA, NA, 56.2, 51.5... |
| 108 | + ..@ label : chr "Weight in kilos" |
| 109 | + ..@ format.stata: chr "%12.0g" |
| 110 | + ..@ labels : Named num [1:2] -5 -1 |
| 111 | + .. ..- attr(*, "names")= chr [1:2] "UNABLE TO OBTAIN HEIGHT MEASUREMENT" "Not applicable" |
| 112 | + $ GCHTCM00 : dbl+lbl [1:17614] NA, 174, 181, NA, NA, NA, NA, 169, 185, NA, 16... |
| 113 | + ..@ label : chr "Height in cms" |
| 114 | + ..@ format.stata: chr "%12.0g" |
| 115 | + ..@ labels : Named num [1:2] -5 -1 |
| 116 | + .. ..- attr(*, "names")= chr [1:2] "Unable to obtain height measurement" "Not applicable" |
| 117 | + $ GCWTCM00 : dbl+lbl [1:17614] NA, 59.4, 71.4, NA, NA, NA, NA, 75... |
| 118 | + ..@ label : chr "Weight in kilos" |
| 119 | + ..@ format.stata: chr "%12.0g" |
| 120 | + ..@ labels : Named num [1:2] -5 -1 |
| 121 | + .. ..- attr(*, "names")= chr [1:2] "Unable to obtain weight measurement" "Not applicable" |
| 122 | +``` |
| 123 | + |
| 124 | +`df_wide` has 13 columns. Besides the identifiers, `MCSID` and `cnum`, |
| 125 | +there are 11 columns of height and weight measurements (Sweep 2 contributes only height). |
| 126 | +Each of these 11 columns is prefixed by a single letter indicating the |
| 127 | +sweep. We can reshape the dataset into long format (one row per person x |
| 128 | +sweep combination) using the `pivot_longer()` function so that the |
| 129 | +resulting data frame has five columns: two person identifiers, a |
| 130 | +variable for sweep, and variables for height and weight. We specify the |
| 131 | +columns to be reshaped using the `cols` argument, provide the new |
| 132 | +variable names in the `names_to` argument, and the pattern the existing |
| 133 | +column names take using the `names_pattern` argument. For |
| 134 | +`names_pattern` we specify `"(.)(.*)"`, which breaks the column name |
| 135 | +into two pieces: the first character (`"(.)"`) and the rest of the name |
| 136 | +(`"(.*)"`). As noted, the first character holds information on sweep. In |
| 137 | +`names_to`, `.value` is a placeholder for the second piece of the column |
| 138 | +name. |
| 139 | + |
| 140 | +```r |
| 141 | +df_long <- pivot_longer(df_wide, |
| 142 | + cols = matches("C(H|W)TCM00"), # height and weight columns from every sweep |
| 143 | + names_pattern = "(.)(.*)", # first character = sweep, remainder = variable name |
| 144 | + names_to = c("sweep", ".value")) |
| 145 | +``` |
| 146 | + |
| 147 | +``` text |
| 148 | +Warning: `BCHTCM00` and `CCHTCM00` have conflicting value labels. |
| 149 | +ℹ Labels for these values will be taken from `BCHTCM00`. |
| 150 | +✖ Values: -1 |
| 151 | +``` |
| 152 | + |
| 153 | +``` text |
| 154 | +Warning: `BCHTCM00` and `DCHTCM00` have conflicting value labels. |
| 155 | +ℹ Labels for these values will be taken from `BCHTCM00`. |
| 156 | +✖ Values: -8 and -1 |
| 157 | +``` |
| 158 | + |
| 159 | +``` text |
| 160 | +Warning: `BCHTCM00` and `ECHTCM00` have conflicting value labels. |
| 161 | +ℹ Labels for these values will be taken from `BCHTCM00`. |
| 162 | +✖ Values: -1 |
| 163 | +``` |
| 164 | + |
| 165 | +``` text |
| 166 | +Warning: `BCHTCM00` and `FCHTCM00` have conflicting value labels. |
| 167 | +ℹ Labels for these values will be taken from `BCHTCM00`. |
| 168 | +✖ Values: -1 |
| 169 | +``` |
| 170 | + |
| 171 | +``` text |
| 172 | +Warning: `BCHTCM00` and `GCHTCM00` have conflicting value labels. |
| 173 | +ℹ Labels for these values will be taken from `BCHTCM00`. |
| 174 | +✖ Values: -5 and -1 |
| 175 | +``` |
| 176 | + |
| 177 | +``` text |
| 178 | +Warning: `CCWTCM00` and `DCWTCM00` have conflicting value labels. |
| 179 | +ℹ Labels for these values will be taken from `CCWTCM00`. |
| 180 | +✖ Values: -8 |
| 181 | +``` |
| 182 | + |
| 183 | +``` text |
| 184 | +Warning: `CCWTCM00` and `GCWTCM00` have conflicting value labels. |
| 185 | +ℹ Labels for these values will be taken from `CCWTCM00`. |
| 186 | +✖ Values: -5 |
| 187 | +``` |
| 188 | + |
| 189 | +```r |
| 190 | +df_long # print the reshaped (long) tibble |
| 191 | +``` |
| 192 | + |
| 193 | +``` text |
| 194 | +# A tibble: 105,684 × 6 |
| 195 | + MCSID cnum ECWTCMA00 sweep CHTCM00 CWTCM00 |
| 196 | + <chr> <dbl+lbl> <dbl+lbl> <chr> <dbl+lbl> <dbl+l> |
| 197 | + 1 M10001N 1 [1st Cohort Member of the family] NA B 97 NA |
| 198 | + 2 M10001N 1 [1st Cohort Member of the family] NA C 114. 21.2 |
| 199 | + 3 M10001N 1 [1st Cohort Member of the family] NA D 128. 25.5 |
| 200 | + 4 M10001N 1 [1st Cohort Member of the family] NA E NA NA |
| 201 | + 5 M10001N 1 [1st Cohort Member of the family] NA F NA NA |
| 202 | + 6 M10001N 1 [1st Cohort Member of the family] NA G NA NA |
| 203 | + 7 M10002P 1 [1st Cohort Member of the family] 41.8 B 96 NA |
| 204 | + 8 M10002P 1 [1st Cohort Member of the family] 41.8 C 110. 19.2 |
| 205 | + 9 M10002P 1 [1st Cohort Member of the family] 41.8 D 123 26.2 |
| 206 | +10 M10002P 1 [1st Cohort Member of the family] 41.8 E 144. NA |
| 207 | +# ℹ 105,674 more rows |
| 208 | +``` |
| 209 | + |
| 210 | +# Reshaping from Long to Wide |
| 211 | + |
| 212 | +We can also reshape the data from long to wide format using the |
| 213 | +`pivot_wider()` function. In this case, we want to create two new |
| 214 | +columns for each sweep: one for height and one for weight. We specify |
| 215 | +the columns to be reshaped using the `values_from` argument, provide the |
| 216 | +new column names in the `names_from` argument, and use the `names_glue` |
| 217 | +argument to specify the new column names. The `names_glue` argument uses |
| 218 | +curly braces (`{}`) to reference the values from the `names_from` and |
| 219 | +`.value` arguments. As we are specifying multiple columns in |
| 220 | +`values_from`, `.value` is a placeholder for the variable name. |
| 221 | + |
| 222 | +```r |
| 223 | +pivot_wider(df_long, |
| 224 | + values_from = matches("C(W|H)T"), # columns whose values are spread out |
| 225 | + names_from = sweep, # one set of columns per sweep letter |
| 226 | + names_glue = "{sweep}{.value}") # sweep letter followed by the variable name |
| 227 | +``` |
| 228 | + |
| 229 | +``` text |
| 230 | +# A tibble: 17,614 × 20 |
| 231 | + MCSID cnum BECWTCMA00 CECWTCMA00 DECWTCMA00 EECWTCMA00 FECWTCMA00 |
| 232 | + <chr> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> |
| 233 | + 1 M10001N 1 [1st Cohort… NA NA NA NA NA |
| 234 | + 2 M10002P 1 [1st Cohort… 41.8 41.8 41.8 41.8 41.8 |
| 235 | + 3 M10007U 1 [1st Cohort… 40.6 40.6 40.6 40.6 40.6 |
| 236 | + 4 M10008V 1 [1st Cohort… NA NA NA NA NA |
| 237 | + 5 M10008V 2 [2nd Cohort… NA NA NA NA NA |
| 238 | + 6 M10011Q 1 [1st Cohort… 74 74 74 74 74 |
| 239 | + 7 M10014T 1 [1st Cohort… NA NA NA NA NA |
| 240 | + 8 M10015U 1 [1st Cohort… 38.2 38.2 38.2 38.2 38.2 |
| 241 | + 9 M10016V 1 [1st Cohort… 41.5 41.5 41.5 41.5 41.5 |
| 242 | +10 M10017W 1 [1st Cohort… NA NA NA NA NA |
| 243 | +# ℹ 17,604 more rows |
| 244 | +# ℹ 13 more variables: GECWTCMA00 <dbl+lbl>, BCHTCM00 <dbl+lbl>, |
| 245 | +# CCHTCM00 <dbl+lbl>, DCHTCM00 <dbl+lbl>, ECHTCM00 <dbl+lbl>, |
| 246 | +# FCHTCM00 <dbl+lbl>, GCHTCM00 <dbl+lbl>, BCWTCM00 <dbl+lbl>, |
| 247 | +# CCWTCM00 <dbl+lbl>, DCWTCM00 <dbl+lbl>, ECWTCM00 <dbl+lbl>, |
| 248 | +# FCWTCM00 <dbl+lbl>, GCWTCM00 <dbl+lbl> |
| 249 | +``` |
| 250 | + |
| 251 | +# Reshape a Cleaned Dataset from Long to Wide |
| 252 | + |
| 253 | +It is likely that you will not just need to reshape raw data, but |
| 254 | +cleaned data too. In the next two sections we offer advice on naming |
| 255 | +variables so that they are easy to select and reshape in long or wide |
| 256 | +formats. First, let’s clean the long dataset by converting the `cnum` |
| 257 | +and `sweep` columns to integers, creating a new column for follow-up |
| 258 | +time, and creating new `height` and `weight` variables that replace |
| 259 | +negative values in the raw height and weight data with `NA` (as well as |
| 260 | +giving these variables more easy-to-understand names). |
| 261 | + |
| 262 | +```r |
| 263 | +df_long_clean <- df_long %>% # build a cleaned version of the long dataset |
| 264 | + mutate(cnum = as.integer(cnum), # cohort-member number as a plain integer |
| 265 | + sweep = match(sweep, LETTERS), # sweep letter -> position in the alphabet (B = 2, ...) |
| 266 | + fup = fups[sweep], # look up the follow-up value for that sweep number |
| 267 | + height = ifelse(CHTCM00 > 0, CHTCM00, NA), # keep values above 0, otherwise NA |
| 268 | + weight = ifelse(CWTCM00 > 0, CWTCM00, NA)) %>% # same rule for weight |
| 269 | + select(MCSID, cnum, fup, height, weight) # keep identifiers, follow-up and cleaned measures |
| 270 | +``` |
| 271 | + |
| 272 | +To reshape the clean data from long to wide format, we can use the |
| 273 | +`pivot_wider()` function as before. This time, we specify the columns to |
| 274 | +be reshaped using the `values_from` argument, provide the new column |
| 275 | +names in the `names_from` argument, and use the `names_glue` argument |
| 276 | +to specify the new column names. The `names_glue` argument uses curly |
| 277 | +braces (`{}`) to reference the values from the `names_from` and `.value` |
| 278 | +arguments. As we are specifying multiple columns in `values_from`, |
| 279 | +`.value` is a placeholder for the variable name. |
| 280 | + |
| 281 | +```r |
| 282 | +df_wide_clean <- pivot_wider(df_long_clean, |
| 283 | + values_from = c(height, weight), # measures to spread across columns |
| 284 | + names_from = fup, # one column per follow-up value |
| 285 | + names_glue = "{.value}_{fup}y") # e.g. height_3y, weight_17y |
| 286 | + |
| 287 | +df_wide_clean |
| 288 | +``` |
| 289 | + |
| 290 | +``` text |
| 291 | +# A tibble: 17,614 × 14 |
| 292 | + MCSID cnum height_3y height_5y height_7y height_11y height_14y height_17y |
| 293 | + <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> |
| 294 | + 1 M10001N 1 97 114. 128. NA NA NA |
| 295 | + 2 M10002P 1 96 110. 123 144. 163. 174. |
| 296 | + 3 M10007U 1 102 118 129 154. 174. 181. |
| 297 | + 4 M10008V 1 NA NA NA NA NA NA |
| 298 | + 5 M10008V 2 NA NA NA NA NA NA |
| 299 | + 6 M10011Q 1 106 121 137 168. NA NA |
| 300 | + 7 M10014T 1 97 NA NA NA NA NA |
| 301 | + 8 M10015U 1 94 110. 122. 143 164. 169 |
| 302 | + 9 M10016V 1 102 118. 130 152. 167 185. |
| 303 | +10 M10017W 1 99 110. 121. NA NA NA |
| 304 | +# ℹ 17,604 more rows |
| 305 | +# ℹ 6 more variables: weight_3y <dbl>, weight_5y <dbl>, weight_7y <dbl>, |
| 306 | +# weight_11y <dbl>, weight_14y <dbl>, weight_17y <dbl> |
| 307 | +``` |
| 308 | + |
| 309 | +# Reshape a Cleaned Dataset from Wide to Long |
| 310 | + |
| 311 | +Finally, we can reshape the clean wide dataset back to long format using |
| 312 | +the `pivot_longer()` function. We specify the columns to be reshaped |
| 313 | +using the `cols` argument, provide the new variable names in the |
| 314 | +`names_to` argument, and the pattern the existing column names take |
| 315 | +using the `names_pattern` argument. For `names_pattern` we specify |
| 316 | +`"(.*)_(\\d+)y$"`, which breaks the column name into two pieces: the |
| 317 | +variable name (`"(.*)"`), and the follow-up time (`"(\\d+)"`, the digits before the trailing `y`). We also |
| 318 | +use the `names_transform` argument to convert the follow-up time to an |
| 319 | +integer. |
| 320 | + |
| 321 | +```r |
| 322 | +pivot_longer(df_wide_clean, |
| 323 | + cols = matches("_.*y$"), # every <measure>_<fup>y column |
| 324 | + names_pattern = "(.*)_(\\d+)y$", # measure name, then the follow-up digits |
| 325 | + names_to = c(".value", "fup"), |
| 326 | + names_transform = list(fup = as.integer)) |
| 327 | +``` |
| 328 | + |
| 329 | +``` text |
| 330 | +# A tibble: 105,684 × 5 |
| 331 | + MCSID cnum fup height weight |
| 332 | + <chr> <int> <int> <dbl> <dbl> |
| 333 | + 1 M10001N 1 3 97 NA |
| 334 | + 2 M10001N 1 5 114. 21.2 |
| 335 | + 3 M10001N 1 7 128. 25.5 |
| 336 | + 4 M10001N 1 11 NA NA |
| 337 | + 5 M10001N 1 14 NA NA |
| 338 | + 6 M10001N 1 17 NA NA |
| 339 | + 7 M10002P 1 3 96 NA |
| 340 | + 8 M10002P 1 5 110. 19.2 |
| 341 | + 9 M10002P 1 7 123 26.2 |
| 342 | +10 M10002P 1 11 144. NA |
| 343 | +# ℹ 105,674 more rows |
| 344 | +``` |
0 commit comments