Skip to content
Merged
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

* `fill()` (@markfairbanks, #197)

* `pivot_longer()` (@markfairbanks, #204)

* `replace_na()` (@markfairbanks, #202)

# dtplyr 1.1.0
Expand Down
388 changes: 388 additions & 0 deletions R/step-call-pivot_longer.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,388 @@
#' Pivot data from wide to long
#'
#' @description
#' This is a method for the tidyr `pivot_longer()` generic. It is translated to
#' [data.table::melt()]
#'
#' @param data A [lazy_dt()].
#' @inheritParams tidyr::pivot_longer
#' @examples
#' library(tidyr)
#'
#' # Simplest case where column names are character data
#' relig_income_dt <- lazy_dt(relig_income)
#' relig_income_dt %>%
#' pivot_longer(!religion, names_to = "income", values_to = "count")
#'
#' # Slightly more complex case where columns have common prefix,
#' # and missing missings are structural so should be dropped.
#' billboard_dt <- lazy_dt(billboard)
#' billboard %>%
#' pivot_longer(
#' cols = starts_with("wk"),
#' names_to = "week",
#' names_prefix = "wk",
#' values_to = "rank",
#' values_drop_na = TRUE
#' )
#'
#' # Multiple variables stored in column names
#' lazy_dt(who) %>%
#' pivot_longer(
#' cols = new_sp_m014:newrel_f65,
#' names_to = c("diagnosis", "gender", "age"),
#' names_pattern = "new_?(.*)_(.)(.*)",
#' values_to = "count"
#' )
#'
#' # Multiple observations per row
#' anscombe_dt <- lazy_dt(anscombe)
#' anscombe_dt %>%
#' pivot_longer(
#' everything(),
#' names_to = c(".value", "set"),
#' names_pattern = "(.)(.)"
#' )
# exported onLoad
pivot_longer.dtplyr_step <- function(data,
cols,
names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
names_ptypes = list(),
names_transform = list(),
names_repair = "check_unique",
values_to = "value",
values_drop_na = FALSE,
values_ptypes = list(),
values_transform = list(),
...) {

sim_data <- simulate_vars(data)
sim_vars <- names(sim_data)
measure_vars <- names(tidyselect::eval_select(enquo(cols), sim_data))
if (length(measure_vars) == 0) {
abort("`cols` must select at least one column.")
}

multiple_names_to <- length(names_to) > 1
uses_dot_value <- ".value" %in% names_to

variable_name <- "variable"

if (uses_dot_value) {
if (!is.null(names_sep)) {
.value <- str_separate(measure_vars, into = names_to, sep = names_sep)$.value
} else if (!is.null(names_pattern)) {
.value <- str_extract(measure_vars, into = names_to, names_pattern)$.value
} else {
abort("If you use '.value' in `names_to` you must also supply
`names_sep' or `names_pattern")
}

v_fct <- factor(.value, levels = unique(.value))
measure_vars <- split(measure_vars, v_fct)
values_to <- names(measure_vars)
names(measure_vars) <- NULL

if (multiple_names_to) {
variable_name <- names_to[!names_to == ".value"]
}
} else if (multiple_names_to) {
if (is.null(names_sep) && is.null(names_pattern)) {
abort("If you supply multiple names in `names_to` you must also
supply `names_sep` or `names_pattern`")
} else if (!is.null(names_sep) && !is.null(names_pattern)) {
abort("only one of names_sep or names_pattern should be provided")
}
} else {
variable_name <- names_to
}

args <- list(
measure.vars = measure_vars,
variable.name = variable_name,
value.name = values_to,
na.rm = values_drop_na,
variable.factor = FALSE
)

# Clean up call args if defaults are used
if (variable_name == "variable") {
args$variable.name <- NULL
}

if (length(values_to) == 1) {
if (values_to == "value") {
args$value.name <- NULL
}
}

if (is_false(values_drop_na)) {
args$na.rm <- NULL
}

id_vars <- sim_vars[!sim_vars %in% unlist(measure_vars)]

out <- step_call(
data,
"melt",
args = args,
vars = c(id_vars, variable_name, values_to)
)

if (!is.null(names_prefix)) {
out <- mutate(out, !!variable_name := gsub(paste0("^", names_prefix), "", !!sym(variable_name)))
}

if (multiple_names_to && !uses_dot_value) {
if (!is.null(names_sep)) {
into_cols <- str_separate(pull(out, !!sym(variable_name)), names_to, sep = names_sep)
} else {
into_cols <- str_extract(pull(out, !!sym(variable_name)), into = names_to, regex = names_pattern)
}
out <- mutate(out, !!!into_cols)

# Need to drop variable_name and move names_to vars to correct position
# Recreates relocate logic so only select is necessary, not relocate + select
out_vars <- out$vars
var_idx <- which(out_vars == variable_name)
before_vars <- out_vars[seq_along(out_vars) < var_idx]
after_vars <- out_vars[seq_along(out_vars) > var_idx]

out <- select(out, !!!syms(before_vars), !!!syms(names_to), !!!syms(after_vars))
} else if (!multiple_names_to && uses_dot_value) {
out <- mutate(out, variable = NULL)
}

out <- step_repair(out, repair = names_repair)

## names_ptype & names_transform
cast_vars <- intersect(names_to, names(names_ptypes))
if (length(cast_vars) > 0) {
cast_calls <- vector("list", length(cast_vars))
names(cast_calls) <- cast_vars
for (i in seq_along(cast_calls)) {
cast_calls[[i]] <- call2("vec_cast", sym(cast_vars[[i]]), names_ptypes[[i]])
}
out <- mutate(out, !!!cast_calls)
}

coerce_vars <- intersect(names_to, names(names_transform))
if (length(coerce_vars) > 0) {
coerce_calls <- vector("list", length(coerce_vars))
names(coerce_calls) <- coerce_vars
for (i in seq_along(coerce_calls)) {
.fn <- as_function(names_transform[[i]])
coerce_calls[[i]] <- call2(.fn, sym(coerce_vars[[i]]))
}
out <- mutate(out, !!!coerce_calls)
}

## values_ptype & values_transform
cast_vars <- intersect(values_to, names(values_ptypes))
if (length(cast_vars) > 0) {
cast_calls <- vector("list", length(cast_vars))
names(cast_calls) <- cast_vars
for (i in seq_along(cast_calls)) {
cast_calls[[i]] <- call2(expr(vec_cast), sym(cast_vars[[i]]), values_ptypes[[i]])
}
out <- mutate(out, !!!cast_calls)
}

coerce_vars <- intersect(values_to, names(values_transform))
if (length(coerce_vars) > 0) {
coerce_calls <- vector("list", length(coerce_vars))
names(coerce_calls) <- coerce_vars
for (i in seq_along(coerce_calls)) {
.fn <- as_function(values_transform[[i]])
coerce_calls[[i]] <- call2(.fn, sym(coerce_vars[[i]]))
}
out <- mutate(out, !!!coerce_calls)
}

out
}

# exported onLoad
pivot_longer.data.table <- function(data,
cols,
names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
names_ptypes = list(),
names_transform = list(),
names_repair = "check_unique",
values_to = "value",
values_drop_na = FALSE,
values_ptypes = list(),
values_transform = list(),
...) {
data <- lazy_dt(data)
tidyr::pivot_longer(
data = data,
cols = {{ cols }},
names_to = names_to,
names_prefix = names_prefix,
names_sep = names_sep,
names_pattern = names_pattern,
names_ptypes = names_ptypes,
names_transform = names_transform,
names_repair = names_repair,
values_to = values_to,
values_drop_na = values_drop_na,
values_ptypes = values_ptypes,
values_transform = values_transform,
...
)
}

# str_extract() -----------------------------------------------------------------
str_extract <- function(x, into, regex, convert = FALSE) {
stopifnot(
is_string(regex),
is_character(into)
)

out <- str_match_first(x, regex)
if (length(out) != length(into)) {
stop(
"`regex` should define ", length(into), " groups; ", ncol(out), " found.",
call. = FALSE
)
}

# Handle duplicated names
if (anyDuplicated(into)) {
pieces <- split(out, into)
into <- names(pieces)
out <- lapply(pieces, pmap_chr, paste0, sep = "")
}

into <- as_utf8_character(into)

non_na_into <- !is.na(into)
out <- out[non_na_into]
names(out) <- into[non_na_into]

if (convert) {
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
}

out
}

str_match_first <- function(string, regex) {
loc <- regexpr(regex, string, perl = TRUE)
loc <- group_loc(loc)

out <- lapply(
seq_len(loc$matches),
function(i) substr(string, loc$start[, i], loc$end[, i])
)
out[-1]
}

group_loc <- function(x) {
start <- cbind(as.vector(x), attr(x, "capture.start"))
end <- start + cbind(attr(x, "match.length"), attr(x, "capture.length")) - 1L

no_match <- start == -1L
start[no_match] <- NA
end[no_match] <- NA

list(matches = ncol(start), start = start, end = end)
}

# str_separate() -----------------------------------------------------------------

str_separate <- function(x, into, sep, convert = FALSE, extra = "warn", fill = "warn") {
if (!is.character(into)) {
abort("`into` must be a character vector")
}

if (is.numeric(sep)) {
out <- strsep(x, sep)
} else if (is_character(sep)) {
out <- data.table::tstrsplit(x, sep, fixed = TRUE, names = TRUE)
out <- as_tibble(out)
} else {
abort("`sep` must be either numeric or character")
}

names(out) <- as_utf8_character(into)
out <- out[!is.na(names(out))]
if (convert) {
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
}
out
}

strsep <- function(x, sep) {
nchar <- nchar(x)
pos <- lapply(sep, function(i) {
if (i >= 0) return(i)
pmax(0, nchar + i)
})
pos <- c(list(0), pos, list(nchar))

lapply(1:(length(pos) - 1), function(i) {
substr(x, pos[[i]] + 1, pos[[i + 1]])
})
}

str_split_n <- function(x, pattern, n_max = -1) {
if (is.factor(x)) {
x <- as.character(x)
}
m <- gregexpr(pattern, x, perl = TRUE)
if (n_max > 0) {
m <- lapply(m, function(x) slice_match(x, seq_along(x) < n_max))
}
regmatches(x, m, invert = TRUE)
}

slice_match <- function(x, i) {
structure(
x[i],
match.length = attr(x, "match.length")[i],
index.type = attr(x, "index.type"),
useBytes = attr(x, "useBytes")
)
}

list_indices <- function(x, max = 20) {
if (length(x) > max) {
x <- c(x[seq_len(max)], "...")
}

paste(x, collapse = ", ")
}

# pmap()/pmap_chr() -----------------------------------------------------------------

args_recycle <- function(args) {
lengths <- vapply(args, length, integer(1))
n <- max(lengths)

stopifnot(all(lengths == 1L | lengths == n))
to_recycle <- lengths == 1L
args[to_recycle] <- lapply(args[to_recycle], function(x) rep.int(x, n))

args
}

pmap <- function(.l, .f, ...) {
args <- args_recycle(.l)
do.call("mapply", c(
FUN = list(quote(.f)),
args, MoreArgs = quote(list(...)),
SIMPLIFY = FALSE, USE.NAMES = FALSE
))
}

pmap_chr <- function(.l, .f, ...) {
as.character(pmap(.l, .f, ...))
}
Loading