Skip to content

Allow custom age groups sizes in create_age_groups#169

Draft
Moohan wants to merge 4 commits intomasterfrom
93-create_age_groups-with-bins-of-custom-lengths
Draft

Allow custom age groups sizes in create_age_groups#169
Moohan wants to merge 4 commits intomasterfrom
93-create_age_groups-with-bins-of-custom-lengths

Conversation

@Moohan
Copy link
Copy Markdown
Member

@Moohan Moohan commented Feb 18, 2026

I've just had a first go at this, the logic and messages might need fixing up a bit and the tests will definitiely need adding to / changing to expect the deprecation warning messages.

I implemented it as discussed in #93.

I think it would make sense to also implement @Nic-Chr suggestions from #135 in the same PR?

@Moohan Moohan linked an issue Feb 18, 2026 that may be closed by this pull request
@Moohan Moohan requested review from Nic-Chr and Tina815 February 18, 2026 17:00
Comment on lines +45 to +46
breaks = seq(0, 90, 5),
as_factor = FALSE
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
breaks = seq(0, 90, 5),
as_factor = FALSE
as_factor = FALSE,
breaks = seq(0, 90, 5)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add checks to theses as_factor needs to be a logical and breaks must be a numeric vector.

Comment on lines +49 to +51
lifecycle::is_present(from) ||
lifecycle::is_present(to) ||
lifecycle::is_present(by)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use !missing(...) instead of lifecycle::is_present (check this is the same).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked and the overhead of lifecycle::is_present is very small anyway so we can keep it I think.

@Nic-Chr
Copy link
Copy Markdown
Contributor

Nic-Chr commented Feb 26, 2026

As mentioned in #135 we can safely use .bincode() instead of cut() to improve performance.

create_age_groups1 <- function(
    x,
    from = lifecycle::deprecated(),
    to = lifecycle::deprecated(),
    by = lifecycle::deprecated(),
    as_factor = FALSE,
    breaks = seq(0, 90, 5)
) {
  if (
    lifecycle::is_present(from) ||
    lifecycle::is_present(to) ||
    lifecycle::is_present(by)
  ) {
    if (missing(breaks)) {
      lifecycle::deprecate_soft(
        when = "1.0.0",
        what = I("create_age_groups(from/to/by)"),
        with = "create_age_groups(breaks)"
      )
      
      # Fall back to defaults for any missing deprecated arguments
      if (!lifecycle::is_present(from)) from <- 0
      if (!lifecycle::is_present(to)) to <- 90
      if (!lifecycle::is_present(by)) by <- 5
      
      breaks <- seq(from, to, by)
    } else {
      cli::cli_abort(
        "{.arg from}, {.arg to} or {.arg by} should no longer be used, use {.arg breaks} only"
      )
    }
  }
  
  breaks <- c(breaks, Inf)
  breaks <- sort(unique(breaks))
  
  # Create labels based on consecutive values in breaks
  labels <- paste0(utils::head(breaks, -1), "-", utils::tail(breaks, -1) - 1)
  
  # Reformat label for last value
  labels <- gsub("-Inf", "+", labels)
  
  agegroup <- cut(
    x,
    breaks = breaks,
    labels = labels,
    right = FALSE,
    ordered_result = TRUE
  )
  
  if (as_factor == FALSE) {
    agegroup <- as.character(agegroup)
  }
  
  agegroup
}
create_age_groups2 <- function(
    x,
    from = lifecycle::deprecated(),
    to = lifecycle::deprecated(),
    by = lifecycle::deprecated(),
    as_factor = FALSE,
    breaks = seq(0, 90, 5)
) {
  if (
    lifecycle::is_present(from) ||
    lifecycle::is_present(to) ||
    lifecycle::is_present(by)
  ) {
    if (missing(breaks)) {
      lifecycle::deprecate_soft(
        when = "1.0.0",
        what = I("create_age_groups(from/to/by)"),
        with = "create_age_groups(breaks)"
      )
      
      # Fall back to defaults for any missing deprecated arguments
      if (!lifecycle::is_present(from)) from <- 0
      if (!lifecycle::is_present(to)) to <- 90
      if (!lifecycle::is_present(by)) by <- 5
      
      breaks <- seq(from, to, by)
    } else {
      cli::cli_abort(
        "{.arg from}, {.arg to} or {.arg by} should no longer be used, use {.arg breaks} only"
      )
    }
  }
  
  
  n_breaks <- length(breaks)
  n_intervals <- max(n_breaks - 1L, 0L)
  
  age_bands <- paste0(
    breaks[seq_len(n_intervals)], "-",
    breaks[seq(to = n_breaks, length.out = n_intervals)] - 1L
  )
  rightmost_band <- paste0(breaks[n_breaks], "+")
  age_bands[n_breaks] <- rightmost_band
  
  factor_codes <- .bincode(x, breaks = c(breaks, Inf), right = FALSE)
  
  if (as_factor){
    out <- factor_codes
    levels(out) <- age_bands
    class(out) <- c("ordered", "factor")
  } else {
    out <- age_bands[factor_codes]
  }
  out
}

x <- sample(0:100, 10^6, TRUE)

library(bench)
mark(
  with_cut = create_age_groups1(x, as_factor = FALSE),
  with_bincode = create_age_groups2(x, as_factor = FALSE)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 with_cut       46.1ms     48ms      19.9    30.8MB     21.9
#> 2 with_bincode   37.1ms   42.4ms      21.1    19.3MB     17.3

mark(
  with_cut = create_age_groups1(x, as_factor = TRUE),
  with_bincode = create_age_groups2(x, as_factor = TRUE)
)
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 with_cut       36.6ms   36.6ms      27.3    22.9MB    150. 
#> 2 with_bincode   32.1ms   32.3ms      30.3    15.3MB     45.4

# Overhead on small vectors
y <- 1:10

mark(
  with_cut = create_age_groups1(y, as_factor = FALSE),
  with_bincode = create_age_groups2(y, as_factor = FALSE)
)
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 with_cut      162.8µs  169.3µs     5719.    3.94KB    10.3 
#> 2 with_bincode   78.8µs   82.1µs    11370.    1.73KB     8.22

mark(
  with_cut = create_age_groups1(y, as_factor = TRUE),
  with_bincode = create_age_groups2(y, as_factor = TRUE)
)
#> # A tibble: 2 × 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 with_cut      158.9µs  167.1µs     5517.    3.94KB     10.4
#> 2 with_bincode   80.5µs   83.8µs    11519.    2.02KB     10.4

Created on 2026-02-26 with reprex v2.1.1

Note: I replaced the use of utils::tail and utils::head to improve overhead.

The benchmarks show slight improvement in speed, fair improvement on memory usage. Given how easy of an implementation it is with essentially no drawbacks, it might be worth implementing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

create_age_groups with bins of custom lengths

2 participants