Allow custom age groups sizes in create_age_groups#169
Allow custom age groups sizes in create_age_groups#169
create_age_groups#169Conversation
R/create_age_groups.R
Outdated
| breaks = seq(0, 90, 5), | ||
| as_factor = FALSE |
There was a problem hiding this comment.
| breaks = seq(0, 90, 5), | |
| as_factor = FALSE | |
| as_factor = FALSE, | |
| breaks = seq(0, 90, 5) |
There was a problem hiding this comment.
Add checks to theses as_factor needs to be a logical and breaks must be a numeric vector.
| lifecycle::is_present(from) || | ||
| lifecycle::is_present(to) || | ||
| lifecycle::is_present(by) |
There was a problem hiding this comment.
Use !missing(...) instead of lifecycle::is_present (check this is the same).
There was a problem hiding this comment.
I checked and the overhead of lifecycle::is_present is very small anyway so we can keep it I think.
|
As mentioned in #135 we can safely use create_age_groups1 <- function(
x,
from = lifecycle::deprecated(),
to = lifecycle::deprecated(),
by = lifecycle::deprecated(),
as_factor = FALSE,
breaks = seq(0, 90, 5)
) {
if (
lifecycle::is_present(from) ||
lifecycle::is_present(to) ||
lifecycle::is_present(by)
) {
if (missing(breaks)) {
lifecycle::deprecate_soft(
when = "1.0.0",
what = I("create_age_groups(from/to/by)"),
with = "create_age_groups(breaks)"
)
# Fall back to defaults for any missing deprecated arguments
if (!lifecycle::is_present(from)) from <- 0
if (!lifecycle::is_present(to)) to <- 90
if (!lifecycle::is_present(by)) by <- 5
breaks <- seq(from, to, by)
} else {
cli::cli_abort(
"{.arg from}, {.arg to} or {.arg by} should no longer be used, use {.arg breaks} only"
)
}
}
breaks <- c(breaks, Inf)
breaks <- sort(unique(breaks))
# Create labels based on consecutive values in breaks
labels <- paste0(utils::head(breaks, -1), "-", utils::tail(breaks, -1) - 1)
# Reformat label for last value
labels <- gsub("-Inf", "+", labels)
agegroup <- cut(
x,
breaks = breaks,
labels = labels,
right = FALSE,
ordered_result = TRUE
)
if (as_factor == FALSE) {
agegroup <- as.character(agegroup)
}
agegroup
}
create_age_groups2 <- function(
x,
from = lifecycle::deprecated(),
to = lifecycle::deprecated(),
by = lifecycle::deprecated(),
as_factor = FALSE,
breaks = seq(0, 90, 5)
) {
if (
lifecycle::is_present(from) ||
lifecycle::is_present(to) ||
lifecycle::is_present(by)
) {
if (missing(breaks)) {
lifecycle::deprecate_soft(
when = "1.0.0",
what = I("create_age_groups(from/to/by)"),
with = "create_age_groups(breaks)"
)
# Fall back to defaults for any missing deprecated arguments
if (!lifecycle::is_present(from)) from <- 0
if (!lifecycle::is_present(to)) to <- 90
if (!lifecycle::is_present(by)) by <- 5
breaks <- seq(from, to, by)
} else {
cli::cli_abort(
"{.arg from}, {.arg to} or {.arg by} should no longer be used, use {.arg breaks} only"
)
}
}
n_breaks <- length(breaks)
n_intervals <- max(n_breaks - 1L, 0L)
age_bands <- paste0(
breaks[seq_len(n_intervals)], "-",
breaks[seq(to = n_breaks, length.out = n_intervals)] - 1L
)
rightmost_band <- paste0(breaks[n_breaks], "+")
age_bands[n_breaks] <- rightmost_band
factor_codes <- .bincode(x, breaks = c(breaks, Inf), right = FALSE)
if (as_factor){
out <- factor_codes
levels(out) <- age_bands
class(out) <- c("ordered", "factor")
} else {
out <- age_bands[factor_codes]
}
out
}
x <- sample(0:100, 10^6, TRUE)
library(bench)
mark(
with_cut = create_age_groups1(x, as_factor = FALSE),
with_bincode = create_age_groups2(x, as_factor = FALSE)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 with_cut 46.1ms 48ms 19.9 30.8MB 21.9
#> 2 with_bincode 37.1ms 42.4ms 21.1 19.3MB 17.3
mark(
with_cut = create_age_groups1(x, as_factor = TRUE),
with_bincode = create_age_groups2(x, as_factor = TRUE)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 with_cut 36.6ms 36.6ms 27.3 22.9MB 150.
#> 2 with_bincode 32.1ms 32.3ms 30.3 15.3MB 45.4
# Overhead on small vectors
y <- 1:10
mark(
with_cut = create_age_groups1(y, as_factor = FALSE),
with_bincode = create_age_groups2(y, as_factor = FALSE)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 with_cut 162.8µs 169.3µs 5719. 3.94KB 10.3
#> 2 with_bincode 78.8µs 82.1µs 11370. 1.73KB 8.22
mark(
with_cut = create_age_groups1(y, as_factor = TRUE),
with_bincode = create_age_groups2(y, as_factor = TRUE)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 with_cut 158.9µs 167.1µs 5517. 3.94KB 10.4
#> 2 with_bincode 80.5µs 83.8µs 11519. 2.02KB 10.4Created on 2026-02-26 with reprex v2.1.1 Note: I replaced the use of The benchmarks show slight improvement in speed, fair improvement on memory usage. Given how easy of an implementation it is with essentially no drawbacks, it might be worth implementing. |
I've just had a first go at this, the logic and messages might need fixing up a bit and the tests will definitiely need adding to / changing to expect the deprecation warning messages.
I implemented it as discussed in #93.
I think it would make sense to also implement @Nic-Chr suggestions from #135 in the same PR?