Skip to content

Commit a1cbe53

Browse files
authored
Merge branch 'master' into merge_factor_char_key
2 parents dda8722 + c16f320 commit a1cbe53

22 files changed

+1060
-49
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ export(nafill)
5959
export(setnafill)
6060
export(.Last.updated)
6161
export(fcoalesce)
62+
export(cbindlist, setcbindlist)
6263
export(substitute2)
6364
#export(DT) # mtcars |> DT(i,j,by) #4872 #5472
6465
export(fctr)

NEWS.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
# 2: 2 6 4 5
4141
```
4242

43+
8. `groupingsets()` gets a new argument `enclos` for use together with the `jj` argument in functions wrapping `groupingsets()`, including the existing wrappers `rollup()` and `cube()`. When forwarding a `j`-expression as `groupingsets(jj = substitute(j))`, make sure to pass `enclos = parent.frame()` as well, so that the `j`-expression will be evaluated in the right context. This makes it possible for `j` to refer to variables outside the `data.table`.
44+
4345
### BUG FIXES
4446

4547
1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.
@@ -72,7 +74,11 @@
7274
7375
15. Including an `ITime` object as a named input to `data.frame()` respects the provided name, i.e. `data.frame(a = as.ITime(...))` will have column `a`, [#4673](https://github.com/Rdatatable/data.table/issues/4673). Thanks @shrektan for the report and @MichaelChirico for the fix.
7476
75-
16. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
77+
16. `fread()` now handles the `na.strings` argument for quoted text columns, making it possible to specify `na.strings = '""'` and read empty quoted strings as `NA`s, [#6974](https://github.com/Rdatatable/data.table/issues/6974). Thanks to @AngelFelizR for the report and @aitap for the PR.
78+
79+
17. A data.table with a column of class `vctrs_list_of` (from package {vctrs}) prints as expected, [#5948](https://github.com/Rdatatable/data.table/issues/5948). Before, they could be printed messily, e.g. printing every entry in a nested data.frame. Thanks @jesse-smith for the report, @DavisVaughan and @r2evans for contributing, and @MichaelChirico for the PR.
80+
81+
18. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)`) and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
7682
7783
### NOTES
7884

R/data.table.R

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -542,12 +542,25 @@ replace_dot_alias = function(e) {
542542
# Really, `anyDuplicated` in base is AWESOME!
543543
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
544544
if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
545-
irows = if (allLen1) f__ else vecseq(f__,len__,
546-
if (allow.cartesian ||
547-
notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
548-
!anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
549-
NULL # #742. If 'i' has no duplicates, ignore
550-
} else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
545+
if (allLen1) {
546+
irows = f__
547+
} else {
548+
join.many = isTRUE(getOption("datatable.join.many")) # #914, default TRUE for backward compatibility
549+
anyDups = !notjoin &&
550+
(
551+
# #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
552+
(join.many && !allow.cartesian) ||
553+
# special case of scalar i match to const duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
554+
(!join.many && (length(f__) != 1L || len__ != nrow(x)))
555+
) &&
556+
anyDuplicated(f__, incomparables = c(0L, NA_integer_)) > 0L
557+
limit = if (anyDups) { # #742. If 'i' has no duplicates, ignore
558+
if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
559+
if (allow.cartesian) internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
560+
as.double(nrow(x)+nrow(i)) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
561+
}
562+
irows = vecseq(f__, len__, limit)
563+
}
551564
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
552565
# Fix for #1092 and #1074
553566
# TODO: implement better version of "any"/"all"/"which" to avoid

R/groupingsets.R

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) {
1313
sets = lapply(length(by):0L, function(i) by[0L:i])
1414
# redirect to workhorse function
1515
jj = substitute(j)
16-
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label)
16+
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame())
1717
}
1818

1919
cube = function(x, ...) {
@@ -35,13 +35,13 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) {
3535
sets = lapply((2L^n):1L, function(jj) by[keepBool[jj, ]])
3636
# redirect to workhorse function
3737
jj = substitute(j)
38-
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label)
38+
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame())
3939
}
4040

4141
groupingsets = function(x, ...) {
4242
UseMethod("groupingsets")
4343
}
44-
groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, ...) {
44+
groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), ...) {
4545
# input data type basic validation
4646
if (!is.data.table(x))
4747
stopf("Argument 'x' must be a data.table object")
@@ -112,7 +112,10 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe
112112
.SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL
113113
if (length(names(by))) by = unname(by)
114114
# 0 rows template data.table to keep colorder and type
115-
empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by]
115+
# inline all arguments that might clash with enclosing environment
116+
pcall = substitute(x[0L, jj, by], list(x = x, jj = jj, by = by))
117+
if (length(.SDcols)) pcall$.SDcols = .SDcols
118+
empty = eval(pcall, list(.datatable.aware = TRUE), enclos)
116119
if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field
117120
stopf("When using `id=TRUE` the 'j' expression must not evaluate to a column named 'grouping'.")
118121
if (anyDuplicated(names(empty)) > 0L)
@@ -150,8 +153,12 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe
150153
stopf("Using integer64 class columns require to have 'bit64' package installed.") # nocov
151154
int64.by.cols = intersect(int64.cols, by)
152155
# aggregate function called for each grouping set
156+
# inline all arguments that might clash with enclosing environment
157+
pcall = substitute(x[, jj], list(x = x, jj = jj))
158+
if (length(.SDcols)) pcall$.SDcols = .SDcols
153159
aggregate.set = function(by.set) {
154-
r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set]
160+
pcall$by = by.set
161+
r = eval(pcall, list(.datatable.aware = TRUE), enclos)
155162
if (id) {
156163
# integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE
157164
# 3267: strtoi("", base = 2L) output apparently unstable across platforms

0 commit comments

Comments
 (0)