|
1 | 1 | # ───────────────────────────────────────────────────────────────────────────── |
2 | | -# BENCHMARK: add NEW3 using dt[, .. := .., by=group] |
| 2 | +# BENCHMARK in pure data.table, no back-and-forth to data.frame |
3 | 3 | # ───────────────────────────────────────────────────────────────────────────── |
4 | 4 |
|
5 | | -library(formula.tools); library(compiler) |
6 | | -library(dplyr); library(purrr) |
7 | | -library(data.table); library(tictoc) |
| 5 | +library(formula.tools) # lhs() / rhs() |
| 6 | +library(compiler) # cmpfun() |
| 7 | +library(purrr) # map(), map2() |
| 8 | +library(data.table) # as.data.table(), .GRP, rbindlist(), setcolorder() |
| 9 | +library(tictoc) # tic()/toc() |
8 | 10 |
|
9 | 11 | # 1) Prepare data ------------------------------------------------------------ |
10 | 12 | set.seed(2025) |
11 | | -n <- 1e7 |
| 13 | +n <- 1e8 |
12 | 14 | ngrp <- 10 |
13 | 15 | df <- data.frame( |
14 | | - price = runif(n, 5, 100), |
15 | | - quality = runif(n, 1, 5), |
| 16 | + price = runif(n, 5, 100), |
| 17 | + quality = runif(n, 1, 5), |
16 | 18 | group = rep(1:ngrp, length.out = n) |
17 | 19 | ) |
18 | | -bprice <- -0.2; bquality <- 0.8 |
| 20 | +bprice <- -0.2 |
| 21 | +bquality <- 0.8 |
| 22 | + |
19 | 23 | single_group <- list( |
20 | 24 | V1 = V1 ~ bprice * price + bquality * quality, |
21 | 25 | V2 = V2 ~ 0 |
22 | 26 | ) |
23 | 27 | utility_list <- rep(list(single_group), ngrp) |
24 | 28 |
|
25 | 29 | # 2) Helpers --------------------------------------------------------------- |
26 | | -by_formula <- function(eq) pick(everything()) %>% transmute(!!lhs(eq) := !!rhs(eq)) |
27 | | -compile_one <- function(fm){ |
28 | | - nm <- as.character(lhs(fm)); rhs <- rhs(fm) |
29 | | - fn <- eval(bquote(function(df) with(df, .(rhs)))) |
| 30 | + |
| 31 | +compile_one <- function(fm) { |
| 32 | + nm <- as.character(lhs(fm)) |
| 33 | + rhs <- rhs(fm) |
| 34 | + fn <- eval(bquote(function(d) with(d, .(rhs)))) |
30 | 35 | list(name = nm, fun = cmpfun(fn)) |
31 | 36 | } |
32 | | -compile_utility_list <- function(u) lapply(u, function(fl){ |
33 | | - tmp <- lapply(fl, compile_one) |
34 | | - setNames(lapply(tmp, `[[`, "fun"), |
35 | | - vapply(tmp, `[[`, "", "name")) |
36 | | -}) |
37 | 37 |
|
38 | | -# 3) Methods -------------------------------------------------------------- |
| 38 | +compile_utility_list <- function(ul) { |
| 39 | + lapply(ul, function(fl) { |
| 40 | + tmp <- lapply(fl, compile_one) |
| 41 | + setNames(lapply(tmp, `[[`, "fun"), |
| 42 | + vapply(tmp, `[[`, "", "name")) |
| 43 | + }) |
| 44 | +} |
| 45 | + |
| 46 | +# 3) Four methods, all returning data.tables ------------------------------- |
39 | 47 |
|
40 | 48 | old_group <- function(data, utility) { |
41 | 49 | tic("OLD-GROUP") |
42 | | - subs <- split(data, data$group) |
43 | | - subs2 <- map2(utility, subs, ~ mutate(.y, map_dfc(.x, by_formula))) |
44 | | - out <- bind_rows(subs2) |
| 50 | + subs <- split(data, data$group) |
| 51 | + out <- map2(utility, subs, ~ { |
| 52 | + dt <- as.data.table(.y) |
| 53 | + for (fm in .x) { |
| 54 | + dt[, as.character(lhs(fm)) := eval(rhs(fm), dt)] |
| 55 | + } |
| 56 | + dt |
| 57 | + }) |
| 58 | + res <- rbindlist(out) |
45 | 59 | toc(log = FALSE) |
46 | | - out |
| 60 | + res |
47 | 61 | } |
48 | 62 |
|
49 | 63 | new1_group <- function(data, utility) { |
50 | 64 | ufuns <- compile_utility_list(utility) |
51 | 65 | tic("NEW1-GROUP") |
52 | 66 | subs <- split(data, data$group) |
53 | | - subs2 <- map2(ufuns, subs, ~ bind_cols(.y, lapply(.x, function(f) f(.y)))) |
54 | | - out <- bind_rows(subs2) |
| 67 | + out <- map2(ufuns, subs, ~ { |
| 68 | + dt <- as.data.table(.y) |
| 69 | + for (nm in names(.x)) dt[, (nm) := .x[[nm]](dt)] |
| 70 | + dt |
| 71 | + }) |
| 72 | + res <- rbindlist(out) |
55 | 73 | toc(log = FALSE) |
56 | | - out |
| 74 | + res |
57 | 75 | } |
58 | 76 |
|
59 | 77 | new2_group <- function(data, utility) { |
60 | | - ufuns <- compile_utility_list(utility) |
61 | | - dt <- setDT(data) |
| 78 | + ufuns <- compile_utility_list(utility) |
| 79 | + dt <- as.data.table(data) |
| 80 | + groups <- sort(unique(dt$group)) |
| 81 | + gm <- setNames(seq_along(groups), groups) |
| 82 | + |
62 | 83 | tic("NEW2-GROUP (loop)") |
63 | | - for (g in seq_along(ufuns)) { |
64 | | - fns <- ufuns[[g]] |
| 84 | + for (g in groups) { |
| 85 | + idx <- gm[as.character(g)] |
| 86 | + fns <- ufuns[[idx]] |
65 | 87 | dt[group == g, (names(fns)) := lapply(fns, function(f) f(.SD))] |
66 | 88 | } |
67 | 89 | toc(log = FALSE) |
68 | | - as.data.frame(dt) |
| 90 | + dt |
69 | 91 | } |
70 | 92 |
|
71 | 93 | new3_group <- function(data, utility) { |
72 | | - ufuns <- compile_utility_list(utility) |
73 | | - dt <- setDT(data) |
74 | | - varnames <- names(ufuns[[1]]) |
75 | | - tic("NEW3-GROUP (by=group)") |
76 | | - dt[, (varnames) := lapply(ufuns[[.BY$group]], function(f) f(.SD)), by = group] |
| 94 | + ufuns <- compile_utility_list(utility) |
| 95 | + dt <- as.data.table(data) |
| 96 | + |
| 97 | + # figure which columns your utilities will actually use |
| 98 | + sdcols <- intersect( |
| 99 | + names(dt), |
| 100 | + unique(unlist(lapply(utility, function(fl) { |
| 101 | + unlist(lapply(fl, function(fm) all.vars(rhs(fm)))) |
| 102 | + }))) |
| 103 | + ) |
| 104 | + varn <- names(ufuns[[1]]) |
| 105 | + |
| 106 | + tic("NEW3-GROUP (single dt[ , by=group ])") |
| 107 | + out <- dt[ |
| 108 | + , |
| 109 | + c(.SD, |
| 110 | + setNames(lapply(ufuns[[.GRP]], function(f) f(.SD)), varn) |
| 111 | + ), |
| 112 | + by = group, |
| 113 | + .SDcols = sdcols |
| 114 | + ] |
77 | 115 | toc(log = FALSE) |
78 | | - as.data.frame(dt) |
| 116 | + |
| 117 | + # reorder columns to exactly: (sdcols, group, new vars) |
| 118 | + setcolorder(out, c(sdcols, "group", varn)) |
| 119 | + out |
79 | 120 | } |
80 | 121 |
|
81 | | -# 4) Run & validate --------------------------------------------------------- |
| 122 | +# 4) Run & compare --------------------------------------------------------- |
| 123 | + |
82 | 124 | res_old <- old_group(df, utility_list) |
83 | 125 | res_new1 <- new1_group(df, utility_list) |
84 | 126 | res_new2 <- new2_group(df, utility_list) |
85 | 127 | res_new3 <- new3_group(df, utility_list) |
86 | 128 |
|
| 129 | +# numeric‐accuracy check (ignores type/class) |
87 | 130 | stopifnot( |
88 | | - identical(res_old, res_new1), |
89 | | - identical(res_new1, res_new2), |
90 | | - identical(res_new2, res_new3) |
| 131 | + isTRUE(all.equal(res_old, res_new1)), |
| 132 | + isTRUE(all.equal(res_new1, res_new2)), |
| 133 | + isTRUE(all.equal(res_new2, res_new3)) |
91 | 134 | ) |
92 | | -message("✅ All four methods agree on results.") |
| 135 | +message("✅ All four methods agree (within numeric tolerance).") |
0 commit comments