|
#' Splits a dataframe into two, such that it could be reassembled with a
#' mutating join
#'
#' This function takes an arbitrary number of 'joining' columns and any number
#' of additional column names and splits a dataframe in two such that a user
#' could then re-join using [merge()] or [dplyr::left_join()]. The user may find
#' it appropriate to go on and apply [messy()] to each new dataframe
#' independently to impede rejoining.
#'
#' Real data is often found across multiple datasets. For example, in
#' environmental monitoring, measurements at a monitoring station may need to be
#' bound with metadata about the station such as geographic coordinates, or even
#' meteorological data from an external source, to produce desired outputs. In
#' clinical research it may be necessary to combine the results of a clinical
#' trial with relevant patient information, such as weight or sex. This function
#' undoes existing joins to present learners with an authentic problem to solve;
#' joining two independent datasets to achieve some goal.
#'
#' @param data input dataframe
#' @param by a vector of column names which will be present in both outputs, to
#'   rejoin the dataframes
#' @param cols specific columns to be present in the 'right' dataframe.
#'   implicitly, all other columns not in 'cols' will be present in the 'left'
#'   dataframe.
#' @param distinct Apply [dplyr::distinct()] to `"both"` dataframes, the
#'   `"left"` or `"right"` dataframes, or `"none"` of the dataframes. This may
#'   be useful if one table is a 'lookup' or metadata table that has its values
#'   repeated many times in `data`.
#' @param names The names of the output list. If `NULL` the list will be
#'   unnamed.
#'
#' @returns A list of two dataframes
#'
#' @examples
#' dummy <-
#'   dplyr::tibble(
#'     patient_id = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
#'     test = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
#'     result = c("++", "+", "-", "--", "+", "-", "+", "++", "-"),
#'     sex = c("M", "M", "M", "M", "M", "M", "F", "F", "F"),
#'     age = c(50, 50, 50, 25, 25, 25, 30, 30, 30)
#'   )
#'
#' unjoin(
#'   dummy,
#'   by = "patient_id",
#'   cols = c("sex", "age"),
#'   distinct = "right",
#'   names = c("tests", "patient_info")
#' )
#'
#' @author Jack Davison
#' @family data deconstructors
#' @export
unjoin <- function(data,
                   by,
                   cols,
                   distinct = "none",
                   names = c("left", "right")) {
  # Every requested column must exist; `any()` here was a bug that only
  # errored when *none* of 'cols' were present, contradicting the message.
  if (!all(cols %in% names(data))) {
    stop("Not all of 'cols' are in 'data' names.")
  }
  # The join keys must exist too, otherwise all_of() fails later with a
  # less helpful error.
  if (!all(by %in% names(data))) {
    stop("Not all of 'by' are in 'data' names.")
  }

  distinct <- match.arg(distinct, c("both", "right", "left", "none"))

  # 'left' keeps the keys plus everything not sent to the 'right' table;
  # setdiff() avoids listing the 'by' columns twice.
  x_names <- c(by, setdiff(names(data), c(by, cols)))
  y_names <- c(by, cols)

  x <- dplyr::select(data, dplyr::all_of(x_names))
  y <- dplyr::select(data, dplyr::all_of(y_names))

  # Optionally deduplicate either/both halves, e.g. a repeated lookup table
  if (distinct %in% c("both", "left")) {
    x <- dplyr::distinct(x)
  }
  if (distinct %in% c("both", "right")) {
    y <- dplyr::distinct(y)
  }

  out <- list(x, y)

  if (!is.null(names)) {
    if (length(names) != 2L) {
      stop("'names' should be 'NULL' or a vector of length 2.")
    }
    out <- stats::setNames(out, names)
  }

  out
}
| 90 | + |
#' Splits a dataframe row-wise or col-wise into any arbitrary number of
#' dataframes
#'
#' This function splits a dataframe into any number of dataframes such that they
#' can be rejoined by using [rbind()]/[dplyr::bind_rows()] for [unrbind()] or
#' [cbind()]/[dplyr::bind_cols()] for [uncbind()]. The user may find it
#' appropriate to go on and apply [messy()] to each new dataframe independently
#' to impede rejoining.
#'
#' Real data can often be found in disparate files. For example, data reports
#' may come in monthly and require row-binding together to obtain a complete
#' annual time series. Scientific results may arrive from different laboratories
#' and require binding together for further analysis and comparisons. This
#' function may simulate a single dataframe having come from different sources
#' and requiring binding back together. Base R's [split()] offers an alternative
#' to [unrbind()], but requires a pre-existing factor column to split by and
#' cannot as easily create random splits in the data.
#'
#' @inheritParams unjoin
#' @param sizes A vector of numeric inputs summing to `nrow(data)` for
#'   [unrbind()] or `ncol(data)` for [uncbind()]; the number of rows of each
#'   resulting dataframe. See `probs` for an alternative approach. If neither
#'   are provided, the dataframe will be split roughly in half.
#' @param probs A vector of numeric inputs summing to `1`; the proportion of
#'   rows/columns in each resulting dataframe. An alternative to `sizes`.
#' @param shuffle Shuffle rows in [unrbind()] or columns in [uncbind()]?
#'   Defaults to `TRUE`.
#'
#' @returns A list of dataframes
#'
#' @rdname unrbind
#' @order 1
#'
#' @author Jack Davison
#' @family data deconstructors
#' @export
#'
#' @examples
#' unrbind(dplyr::tibble(mtcars), probs = c(0.5, 0.3, 0.2))
#'
#' uncbind(dplyr::tibble(mtcars), probs = c(0.5, 0.3, 0.2))
unrbind <- function(data,
                    sizes = NULL,
                    probs = NULL,
                    names = NULL,
                    shuffle = TRUE) {
  # `&&` (not elementwise `&`) for a scalar condition in `if`
  if (is.null(sizes) && is.null(probs)) {
    # Default: split roughly in half
    half <- round(nrow(data) / 2)
    sizes <- c(half, half)
  } else if (is.null(sizes)) {
    if (sum(probs) != 1) {
      stop("'probs' must sum to 1")
    }
    # Convert probs to row counts
    sizes <- round(probs * nrow(data))
  } else {
    if (sum(sizes) != nrow(data)) {
      stop("'sizes' must sum to ", nrow(data))
    }
  }

  # Shuffle row indices before assigning groups
  if (shuffle) {
    shuffled_rows <- sample(nrow(data))
    data <- data[shuffled_rows, ]
  }

  # Rounding (of probs or the default halves) may leave a surplus/deficit;
  # absorb it into the largest group so sizes sum to nrow(data) exactly
  diff <- nrow(data) - sum(sizes)
  if (diff != 0) {
    max_index <- which.max(sizes)
    sizes[max_index] <- sizes[max_index] + diff
  }

  # Contiguous group labels, one per row, in proportion to sizes
  groups <- rep(seq_along(sizes), times = sizes)

  # Row-wise split into a list of dataframes
  split_data <- split(data, groups)

  # Apply user-supplied names, or strip split()'s "1", "2", ... names
  if (!is.null(names)) {
    if (length(names) != length(split_data)) {
      stop("The number of names must equal the number of output dataframes")
    }
    split_data <- stats::setNames(split_data, names)
  } else {
    split_data <- unname(split_data)
  }

  # returns a list of dataframes
  split_data
}
| 184 | + |
#' @rdname unrbind
#' @order 2
#' @export
uncbind <- function(data,
                    sizes = NULL,
                    probs = NULL,
                    names = NULL,
                    shuffle = TRUE) {
  # `&&` (not elementwise `&`) for a scalar condition in `if`
  if (is.null(sizes) && is.null(probs)) {
    # Default: split roughly in half
    half <- round(ncol(data) / 2)
    sizes <- c(half, half)
  } else if (is.null(sizes)) {
    if (sum(probs) != 1) {
      stop("'probs' must sum to 1")
    }
    # Convert probs to column counts
    sizes <- round(probs * ncol(data))
  } else {
    if (sum(sizes) != ncol(data)) {
      stop("'sizes' must sum to ", ncol(data))
    }
  }

  # Shuffle column order before assigning groups
  if (shuffle) {
    shuffled_cols <- sample(ncol(data))
    data <- data[, shuffled_cols]
  }

  # Rounding may leave a surplus/deficit; absorb it into the largest group
  # (mirrors unrbind; which.min() here risked a zero/negative group size)
  diff <- ncol(data) - sum(sizes)
  if (diff != 0) {
    max_index <- which.max(sizes)
    sizes[max_index] <- sizes[max_index] + diff
  }

  # Column-wise split: take contiguous runs of columns per group.
  # drop = FALSE keeps single-column selections as dataframes.
  ends <- cumsum(sizes)
  starts <- c(1, ends + 1)[-(length(ends) + 1)]
  split_data <- lapply(seq_along(starts), function(x) {
    data[, starts[x]:ends[x], drop = FALSE]
  })

  # Apply user-supplied names, if given
  if (!is.null(names)) {
    if (length(names) != length(split_data)) {
      stop("The number of names must equal the number of output dataframes")
    }
    split_data <- stats::setNames(split_data, names)
  } else {
    split_data <- unname(split_data)
  }

  # returns a list of dataframes
  split_data
}
0 commit comments