Skip to content

Commit a4c7e7a

Browse files
committed
Merge branch 'issue_2606' of https://github.com/Rdatatable/data.table into issue_2606
2 parents 8f5ffa8 + 9018240 commit a4c7e7a

File tree

5 files changed

+109
-11
lines changed

5 files changed

+109
-11
lines changed

.ci/atime/tests.R

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,5 +286,18 @@ test.list <- atime::atime_test_list(
286286
Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
287287
Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
288288

289+
"tables() !recursive refactor in #2606" = atime::atime_test(
290+
N = as.integer(10^seq(1, 4, by=0.5)),
291+
setup = {
292+
test_env <- new.env()
293+
for (i in 1:N) {
294+
assign(paste0("dt_perf_test", i), data.table(a=1), envir = test_env)
295+
assign(paste0("vec_perf_test", i), 1, envir = test_env)
296+
}
297+
},
298+
expr = {data.table::tables(env = test_env, silent = TRUE, index = TRUE); NULL},
299+
"before" = "5bb645082aa5c4a295cdd211a5a75c849d590b75",
300+
"after" = "8978cf201d8d228506e1e96d3eda7e542471720a"),
301+
289302
tests=extra.test.list)
290303
# nolint end: undesirable_operator_linter.

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@
5656

5757
13. New `mergelist()` and `setmergelist()` similarly work _a la_ `Reduce()` to recursively merge a `list` of data.tables, [#599](https://github.com/Rdatatable/data.table/issues/599). Different join modes (_left_, _inner_, _full_, _right_, _semi_, _anti_, and _cross_) are supported through the `how` argument; duplicate handling goes through the `mult` argument. `setmergelist()` carefully avoids copies where one is not needed, e.g. in a 1:1 left join. Thanks Patrick Nicholson for the FR (in 2013!), @jangorecki for the PR, and @MichaelChirico for extensive reviews and fine-tuning.
5858

59+
14. `tables()` now supports a `recursive=TRUE` argument to detect `data.table` objects nested within plain lists, such as those produced by `split()` or manual list construction, [#2606](https://github.com/Rdatatable/data.table/issues/2606). The recursive search skips data.frame and data.table objects to avoid descending into list-columns. Nested data.tables are reported with intuitive R-like names using $ and [[ ]] notation. Thanks to @MichaelChirico for the suggestion and @venom1204 for the implementation.
60+
5961
### BUG FIXES
6062

6163
1. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR.

R/tables.R

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,30 +19,68 @@ type_size = function(DT) {
1919
}
2020

2121
tables = function(mb=type_size, order.col="NAME", width=80L,
22-
env=parent.frame(), silent=FALSE, index=FALSE)
22+
env=parent.frame(), silent=FALSE, index=FALSE, recursive=FALSE)
2323
{
2424
# Prints name, size and colnames of all data.tables in the calling environment by default
2525
mb_name = as.character(substitute(mb))
2626
if (isTRUE(mb)) { mb=type_size; mb_name="type_size" }
2727
names = ls(envir=env, all.names=TRUE) # include "hidden" objects (starting with .)
28-
obj = mget(names, envir=env) # doesn't copy; mget is ok with ... unlike get, #5197
29-
w = which(vapply_1b(obj, is.data.table))
30-
if (!length(w)) {
28+
objs = mget(names, envir=env) # doesn't copy; mget is ok with ... unlike get, #5197
29+
found_items = list()
30+
if (recursive) {
31+
agenda = mapply(function(obj, name) list(obj=obj, name=name), objs, names, SIMPLIFY=FALSE, USE.NAMES=FALSE)
32+
visited_env = new.env(hash=TRUE)
33+
34+
while (length(agenda)) {
35+
current_item = agenda[[1L]]
36+
agenda[[1L]] = NULL
37+
x = current_item$obj
38+
x_name = current_item$name
39+
if (is.data.table(x)) {
40+
found_items[[length(found_items) + 1L]] = list(name=x_name, obj=x)
41+
next
42+
}
43+
if (is.list(x) && !is.data.frame(x)) {
44+
# Cycle detection
45+
addr = address(x)
46+
if (exists(addr, envir=visited_env, inherits=FALSE)) next
47+
assign(addr, TRUE, envir=visited_env)
48+
49+
item_names = names(x)
50+
children_to_add = vector("list", length(x))
51+
for (i in seq_along(x)) {
52+
child_name = if (!is.null(item_names) && nzchar(item_names[i])) {
53+
paste0(x_name, "$", item_names[i])
54+
} else {
55+
paste0(x_name, "[[", i, "]]")
56+
}
57+
children_to_add[[i]] = list(obj=x[[i]], name=child_name)
58+
}
59+
agenda = c(rev(children_to_add), agenda)
60+
}
61+
}
62+
} else {
63+
w = which(vapply_1b(objs, is.data.table))
64+
if (length(w)) {
65+
found_items = lapply(w, function(i) list(name=names[i], obj=objs[[i]]))
66+
}
67+
}
68+
if (!length(found_items)) {
3169
if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env))
3270
return(invisible(data.table(NULL)))
3371
}
34-
info = data.table(NAME=names[w], NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list())
35-
for (i in seq_along(w)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables
36-
DT = obj[[w[i]]]
72+
info = data.table(NAME=vapply_1c(found_items, `[[`, "name"), NROW=0L, NCOL=0L, MB=0.0, COLS=list(), KEY=list(), INDICES=list())
73+
for (i in seq_along(found_items)) { # avoid rbindlist(lapply(DT_names)) in case of a large number of tables
74+
DT = found_items[[i]]$obj
3775
set(info, i, "NROW", nrow(DT))
3876
set(info, i, "NCOL", ncol(DT))
3977
if (is.function(mb)) set(info, i, "MB", as.integer(mb(DT)/1048576L)) # i.e. 1024**2
4078
if (!is.null(tt<-names(DT))) set(info, i, "COLS", tt) # TODO: don't need these if()s when #5526 is done
4179
if (!is.null(tt<-key(DT))) set(info, i, "KEY", tt)
4280
if (index && !is.null(tt<-indices(DT))) set(info, i, "INDICES", tt)
4381
}
44-
if (!is.function(mb)) info[,MB:=NULL]
45-
if (!index) info[,INDICES:=NULL]
82+
if (!is.function(mb)) info$MB = NULL
83+
if (!index) info$INDICES = NULL
4684
if (!order.col %chin% names(info)) stopf("order.col='%s' not a column name of info", order.col)
4785
info = info[base::order(info[[order.col]])] # base::order to maintain locale ordering of table names
4886
if (!silent) {

inst/tests/tests.Rraw

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21536,3 +21536,37 @@ f = tempfile()
2153621536
writeLines(c('a', rep('0x1.ffffp0', 10000L), '0x1.ff\x9fp0', rep('0x1.ffffp0', 20000L)), f)
2153721537
test(2334, names(fread(f)), "a")
2153821538
unlink(f)
21539+
21540+
#2606 Recursive tables() naming convention
21541+
local({
21542+
lst_named <- list(inner=data.table(a=1))
21543+
lst_unnamed <- list(data.table(b=2))
21544+
nested <- list(l1=list(l2=data.table(c=3)))
21545+
mixed <- list(data.table(x=1), y=data.table(z=2))
21546+
mixed_nested <- list(A=list(data.table(p=1), q=data.table(q=2)))
21547+
out <- tables(recursive=TRUE)$NAME
21548+
expected <- c(
21549+
"lst_named$inner", "lst_unnamed[[1]]", "nested$l1$l2",
21550+
"mixed[[1]]", "mixed$y",
21551+
"mixed_nested$A[[1]]", "mixed_nested$A$q")
21552+
test(2335.1, out, sort(expected))
21553+
})
21554+
local({
21555+
dt <- data.table(val=42)
21556+
e <- new.env()
21557+
e$dt <- dt
21558+
e$self <- e # possible infinite loop if we're not careful
21559+
test(2335.2, tables(recursive=TRUE, env=e)$NAME, "dt")
21560+
})
21561+
local({
21562+
test_obj <- local({
21563+
common_list <- list(dt_inner=data.table(d=4))
21564+
outer_list <- list(first=common_list, unique=data.table(e=5))
21565+
outer_list$second <- outer_list$first
21566+
outer_list
21567+
})
21568+
out <- tables(recursive=TRUE)$NAME
21569+
test(2335.3, length(out), 2L)
21570+
test(2335.4, "test_obj$unique" %in% out)
21571+
test(2335.5, sum(grepl("\\$dt_inner$", out)), 1L)
21572+
})

man/tables.Rd

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
Convenience function for concisely summarizing some metadata of all \code{data.table}s in memory (or an optionally specified environment).
66
}
77
\usage{
8-
tables(mb=type_size, order.col="NAME", width=80,
9-
env=parent.frame(), silent=FALSE, index=FALSE)
8+
tables(mb=type_size, order.col="NAME", width=80L,
9+
env=parent.frame(), silent=FALSE, index=FALSE,
10+
recursive=FALSE)
1011
}
1112
\arguments{
1213
\item{mb}{ a function which accepts a \code{data.table} and returns its size in bytes. By default, \code{type_size} (same as \code{TRUE}) provides a fast lower bound by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). A column \code{"MB"} is included in the output unless \code{FALSE} or \code{NULL}. }
@@ -15,6 +16,9 @@ tables(mb=type_size, order.col="NAME", width=80,
1516
\item{env}{ An \code{environment}, typically the \code{.GlobalEnv} by default, see Details. }
1617
\item{silent}{ \code{logical}; should the output be printed? }
1718
\item{index}{ \code{logical}; if \code{TRUE}, the column \code{INDICES} is added to indicate the indices assorted with each object, see \code{\link{indices}}. }
19+
\item{recursive}{ \code{logical}; if \code{TRUE}, \code{tables} will perform a full,
20+
iterative search into list objects to find nested data.tables.
21+
Defaults to \code{FALSE} for backward compatibility. }
1822
}
1923
\details{
2024
Usually \code{tables()} is executed at the prompt, where \code{parent.frame()} returns \code{.GlobalEnv}. \code{tables()} may also be useful inside functions where \code{parent.frame()} is the local scope of the function; in such a scenario, simply set it to \code{.GlobalEnv} to get the same behaviour as at prompt.
@@ -32,5 +36,12 @@ DT = data.table(A=1:10, B=letters[1:10])
3236
DT2 = data.table(A=1:10000, ColB=10000:1)
3337
setkey(DT,B)
3438
tables()
39+
40+
# Finding data.tables nested in a list
41+
dt_list <- list(a = data.table(x=1:5), b = data.table(y=6:10))
42+
# By default, nested tables are not shown:
43+
tables()
44+
# Use recursive=TRUE to find them:
45+
tables(recursive = TRUE)
3546
}
3647
\keyword{ data }

0 commit comments

Comments
 (0)