Merge branch 'master' into roll-nearest

MichaelChirico · web-flow · commit 0d26a8ccb50b · 2025-07-12T22:44:27.000-07:00
diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R
@@ -277,5 +277,14 @@ test.list <- atime::atime_test_list(
     Slow = "73d79edf8ff8c55163e90631072192301056e336",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
     Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
 
+  "isoweek improved in #7144" = atime::atime_test(
+    setup = {
+      set.seed(349)
+      x = sample(Sys.Date() - 0:5000, N, replace=TRUE)
+    },
+    expr = data.table::isoweek(x),
+    Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
+    Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
+
     tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
diff --git a/NEWS.md b/NEWS.md
diff --git a/R/IDateTime.R b/R/IDateTime.R
@@ -342,19 +342,20 @@ yday    = function(x) convertDate(as.IDate(x), "yday")
 wday    = function(x) convertDate(as.IDate(x), "wday")
 mday    = function(x) convertDate(as.IDate(x), "mday")
 week    = function(x) convertDate(as.IDate(x), "week")
-isoweek = function(x) {
+# TODO(#3279): Investigate if improved as.IDate() makes our below implementation faster than this
+isoweek = function(x) as.integer(format(as.IDate(x), "%V"))
   # ISO 8601-conformant week, as described at
   #   https://en.wikipedia.org/wiki/ISO_week_date
   # Approach:
   # * Find nearest Thursday to each element of x
   # * Find the number of weeks having passed between
   #   January 1st of the year of the nearest Thursdays and x
 
-  x = as.IDate(x)   # number of days since 1 Jan 1970 (a Thurs)
-  nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
-  year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
-  1L + (nearest_thurs - year_start) %/% 7L
-}
+#  x = as.IDate(x)   # number of days since 1 Jan 1970 (a Thurs)
+#  nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
+#  year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
+#  1L + (nearest_thurs - year_start) %/% 7L
+
 
 month   = function(x) convertDate(as.IDate(x), "month")
 quarter = function(x) convertDate(as.IDate(x), "quarter")
diff --git a/R/as.data.table.R b/R/as.data.table.R
@@ -48,6 +48,9 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) {
   if (!identical(keep.rownames, FALSE)) {
     # can specify col name to keep.rownames, #575
     ans = data.table(rn=rownames(x), x, keep.rownames=FALSE)
+    # auto-inferred name 'x' is not back-compatible & inconsistent, #7145
+    if (ncol(x) == 1L && is.null(colnames(x)))
+      setnames(ans, 'x', 'V1')
     if (is.character(keep.rownames))
       setnames(ans, 'rn', keep.rownames[1L])
     return(ans)
@@ -133,9 +136,26 @@ as.data.table.list = function(x,
   missing.check.names = missing(check.names)
   origListNames = if (missing(.named)) names(x) else NULL  # as.data.table called directly, not from inside data.table() which provides .named, #3854
   empty_atomic = FALSE
+
+  # Handle keep.rownames for vectors (mimicking data.frame behavior)
+  rownames_ = NULL
+  check_rownames = !isFALSE(keep.rownames)
+
   for (i in seq_len(n)) {
     xi = x[[i]]
     if (is.null(xi)) next    # eachncol already initialized to 0 by integer() above
+    if (check_rownames && is.null(rownames_)) {
+      if (is.null(dim(xi))) {
+        if (!is.null(nm <- names(xi))) {
+          rownames_ = nm
+          x[[i]] = unname(xi)
+        }
+      } else {
+        if (!is.null(nm <- rownames(xi))) {
+          rownames_ = nm
+        }
+      }
+    }
     if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE
     if ("POSIXlt" %chin% class(xi)) {
       warningf("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
@@ -200,6 +220,18 @@ as.data.table.list = function(x,
   }
   if (any(vnames==".SD")) stopf("A column may not be called .SD. That has special meaning.")
   if (check.names) vnames = make.names(vnames, unique=TRUE)
+
+  # Add rownames column when vector names were found
+  if (!is.null(rownames_)) {
+    rn_name = if (is.character(keep.rownames)) keep.rownames[1L] else "rn"
+    if (!is.na(idx <- chmatch(rn_name, vnames)[1L])) {
+      ans = c(list(ans[[idx]]), ans[-idx])
+      vnames = c(vnames[idx], vnames[-idx])
+    } else {
+      ans = c(list(recycle(rownames_, nrow)), ans)
+      vnames = c(rn_name, vnames)
+    }
+  }
   setattr(ans, "names", vnames)
   setDT(ans, key=key) # copy ensured above; also, setDT handles naming
   if (length(origListNames)==length(ans)) setattr(ans, "names", origListNames)  # PR 3854 and tests 2058.15-17
diff --git a/R/cedta.R b/R/cedta.R
@@ -39,6 +39,16 @@ cedta.pkgEvalsUserCode = c("gWidgetsWWW","statET","FastRWeb","slidify","rmarkdow
 }
 # nocov end
 
+.any_sd_queries_in_stack = function(calls) { # TRUE if any element of 'calls' is a `[` call (per %iscall%) whose first argument is the bare symbol .SD
+  for (ii in length(calls):1) { # nolint: seq_linter. As above.
+    if (!calls[[ii]] %iscall% "[") next # skip anything %iscall% does not report as a `[` call
+    the_lhs = calls[[ii]][[2L]] # element 2 of a call is its first argument, here the object being subset
+    if (!is.name(the_lhs) || the_lhs != ".SD") next # only a bare .SD symbol qualifies
+    return(TRUE)
+  }
+  FALSE # no .SD[...] call found among 'calls'
+}
+
 # cedta = Calling Environment Data.Table-Aware
 cedta = function(n=2L) {
   # Calling Environment Data Table Aware
@@ -52,12 +62,15 @@ cedta = function(n=2L) {
     return(TRUE)
   }
   nsname = getNamespaceName(ns)
+  sc = sys.calls()
   ans = nsname=="data.table" ||
     "data.table" %chin% names(getNamespaceImports(ns)) ||   # most common and recommended cases first for speed
     (nsname=="utils" &&
       (exists("debugger.look", parent.frame(n+1L)) ||
-      (length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972
-    (nsname=="base" && all(c("FUN", "X") %chin% ls(parent.frame(n)))) || # lapply
+      (length(sc)>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972
+    (nsname=="base" && # lapply
+      (all(c("FUN", "X") %chin% ls(parent.frame(n))) ||
+      .any_sd_queries_in_stack(sc))) ||
     (nsname %chin% cedta.pkgEvalsUserCode && .any_eval_calls_in_stack()) ||
     nsname %chin% cedta.override ||
     isTRUE(ns$.datatable.aware) ||  # As of Sep 2018: RCAS, caretEnsemble, dtplyr, rstanarm, rbokeh, CEMiTool, rqdatatable, RImmPort, BPRMeth, rlist
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -21281,13 +21281,46 @@ if (test_R.utils) local({
 })
 
 # Create a data.table when one vector is transposed doesn't respect the name defined by user #4124
-test(2321.1, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
-test(2321.2, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
-test(2321.3, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
-test(2321.4, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
+test(2321.01, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
+test(2321.02, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
+test(2321.03, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
+test(2321.04, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
 ## but respect named column vectors
-test(2321.5, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
-test(2321.6, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))
+test(2321.05, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
+test(2321.06, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))
+## also respect old naming pattern when invoked indirectly, #7145
+M = cbind(1:3)
+test(2321.07, as.data.table(M), data.table(V1=1:3))
+rownames(M) = c('a', 'b', 'c')
+test(2321.08, as.data.table(M), data.table(V1=1:3))
+test(2321.09, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3))
+colnames(M) = 'zz'
+test(2321.10, as.data.table(M), data.table(zz=1:3))
+test(2321.11, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), zz=1:3))
+colnames(M) = 'x'
+test(2321.12, as.data.table(M), data.table(x=1:3))
+test(2321.13, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3))
+M = cbind(M, y=4:6)
+test(2321.14, as.data.table(M), data.table(x=1:3, y=4:6))
+test(2321.15, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3, y=4:6))
+colnames(M) = c('A', 'B')
+test(2321.16, as.data.table(M), data.table(A=1:3, B=4:6))
+test(2321.17, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), A=1:3, B=4:6))
+colnames(M) = NULL
+test(2321.18, as.data.table(M), data.table(V1=1:3, V2=4:6))
+test(2321.19, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, V2=4:6))
+colnames(M) = c('x', '')
+test(2321.20, as.data.table(M), data.table(x=1:3, V2=4:6))
+test(2321.21, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3, V2=4:6))
+colnames(M) = c('', 'x')
+test(2321.22, as.data.table(M), data.table(V1=1:3, x=4:6))
+test(2321.23, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, x=4:6))
+colnames(M) = c('', '')
+test(2321.24, as.data.table(M), data.table(V1=1:3, V2=4:6))
+test(2321.25, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, V2=4:6))
+colnames(M) = c('A', '')
+test(2321.26, as.data.table(M), data.table(A=1:3, V2=4:6))
+test(2321.27, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), A=1:3, V2=4:6))
 
 # New fctr() helper: like factor() but retaining order by default #4837
 test(2322.01, levels(fctr(c("b","a","c"))), c("b","a","c"))
@@ -21378,3 +21411,26 @@ dt = data.table(x = 123456, y = "wide_string")
 test(2329.2, print(dt, col.names = "none"), output = "1: 123456 wide_string\n")
 dt = data.table(a = NA_integer_, b = NaN)
 test(2329.3, print(dt, col.names = "none"), output = "1: NA NaN\n")
+
+# Row name extraction from multiple vectors, #7136
+x <- 1:3 
+y <- setNames(4:6, c("A", "B", "C"))  
+test(2330.1, as.data.table(list(x, y), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), V1=1:3, V2=4:6))
+test(2330.2, as.data.table(list(x, y), keep.rownames="custom"), data.table(custom=c("A", "B", "C"), V1=1:3, V2=4:6))
+test(2330.3, as.data.table(list(y, x), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), V1=4:6, V2=1:3)) 
+
+# Behavior under data.frame()
+test(2330.4, as.data.table(data.frame(x, y), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), x=1:3, y=4:6))
+test(2330.5, as.data.table(data.frame(y, x), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), y=4:6, x=1:3))
+
+DF <- data.frame(row.names = letters[1:6], V = 1:6)     # Test data.frame with explicit rownames
+test(2330.6, as.data.table(list(a = 6:1, DF), keep.rownames=TRUE), data.table(rn=letters[1:6], a=6:1, V=1:6))
+
+z <- setNames(1:3, rep("", 3))  # vector whose names are all empty strings, to check behaviour with all-empty row names
+test(2330.7, as.data.table(list(z), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3))
+
+M <- matrix(1:6, nrow=3, dimnames=list(rep("", 3), c("V1", "V2")))   #  test of list(M) for empty-rowname'd matrix input
+test(2330.8, as.data.table(list(M), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3, V2=4:6))
+
+# a .SD reference in the '...' arguments that lapply() forwards to FUN=`[` is still evaluated as a data.table query
+test(2331, lapply(list(data.table(a=1:2)), `[`, j=.SD[1L]), list(data.table(a=1L)))
diff --git a/man/as.data.table.Rd b/man/as.data.table.Rd
@@ -31,7 +31,7 @@ is.data.table(x)
 }
 \arguments{
   \item{x}{An R object.}
-  \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.}
+  \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead. For lists and when calling \code{data.table()}, names from the first named vector are extracted and used as row names, similar to \code{data.frame()} behavior.}
   \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. }
   \item{sorted}{logical used in \emph{array} method, default \code{TRUE} is overridden when \code{key} is provided. }
   \item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.}
diff --git a/man/data.table.Rd b/man/data.table.Rd
@@ -117,9 +117,11 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
 
     \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use \code{keyby=} routinely when you wish the result to be sorted. May also be \code{TRUE} or \code{FALSE} when \code{by} is provided as an alternative way to accomplish the same operation.}
 
-    \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In case of overlapping variables names inside dataset and in parent scope you can use double dot prefix \code{..cols} to explicitly refer to \code{cols} variable parent scope and not from your dataset.
+    \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In the case of overlapping variable names inside \code{x} and in parent scope, you can use the double dot prefix \code{..cols} to explicitly refer to the \code{cols} variable in parent scope and not from \code{x}.
 
-        When \code{j} is a character vector of column names, a numeric vector of column positions to select or of the form \code{startcol:endcol}, and the value returned is always a \code{data.table}. \code{with=FALSE} is not necessary anymore to select columns dynamically. Note that \code{x[, cols]} is equivalent to \code{x[, ..cols]} and to \code{x[, cols, with=FALSE]} and to \code{x[, .SD, .SDcols=cols]}.}
+        When \code{j} is a character vector of column names, a numeric vector of column positions to select, or of the form \code{startcol:endcol}, the value returned is always a \code{data.table}.
+
+        New code should rarely use this argument, which was originally needed for similarity to data.frame. For example, to select columns from a character vector \code{cols}, in data.frame we do \code{x[, cols]}, which has several equivalents in data.table: \code{x[, .SD, .SDcols=cols]}, \code{x[, ..cols]}, \code{x[, cols, env = list(cols = I(cols))]}, or \code{x[, cols, with=FALSE]}.}
 
     \item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. }
 
diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd
@@ -142,31 +142,31 @@ So far we've seen features of `melt` and `dcast` that are implemented efficientl
 However, there are situations we might run into where the desired operation is not expressed in a straightforward manner. For example, consider the `data.table` shown below:
 
 ```{r}
-s2 <- "family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
-1         30 1998-11-26 2000-01-29         NA             1             2            NA
-2         27 1996-06-22         NA         NA             2            NA            NA
-3         26 2002-07-11 2004-04-05 2007-09-02             2             2             1
-4         32 2004-10-10 2009-08-27 2012-07-21             1             1             1
-5         29 2000-12-05 2005-02-28         NA             2             1            NA"
+s2 <- "family_id age_mother name_child1 name_child2 name_child3 gender_child1 gender_child2 gender_child3
+         1         30         Ben        Anna          NA             1             2            NA
+         2         27         Tom          NA          NA             2            NA            NA
+         3         26         Lia         Sam         Amy             2             2             1
+         4         32         Max         Zoe         Joe             1             1             1
+         5         29         Dan         Eva          NA             2             1            NA"
 DT <- fread(s2)
 DT
 ## 1 = female, 2 = male
 ```
 
-And you'd like to combine (`melt`) all the `dob` columns together, and `gender` columns together. Using the old functionality, we could do something like this:
+And you'd like to combine (`melt`) all the `name` columns together, and `gender` columns together. Using the old functionality, we could do something like this:
 
 ```{r}
 DT.m1 = melt(DT, id.vars = c("family_id", "age_mother"))
 DT.m1[, c("variable", "child") := tstrsplit(variable, "_", fixed = TRUE)]
 DT.c1 = dcast(DT.m1, family_id + age_mother + child ~ variable, value.var = "value")
 DT.c1
 
-str(DT.c1) ## gender column is class IDate now!
+str(DT.c1) ## gender column is character type now!
 ```
 
 #### Issues
 
-1. What we wanted to do was to combine all the `dob` and `gender` type columns together respectively. Instead, we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient).
+1. What we wanted to do was to combine all the `name` and `gender` type columns together respectively. Instead, we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient).
 
     As an analogy, imagine you've a closet with four shelves of clothes and you'd like to put together the clothes from shelves 1 and 2 together (in 1), and 3 and 4 together (in 3). What we are doing is more or less to combine all the clothes together, and then split them back on to shelves 1 and 3!
 
@@ -189,9 +189,9 @@ Since we'd like for `data.table`s to perform this operation straightforward and
 The idea is quite simple. We pass a list of columns to `measure.vars`, where each element of the list contains the columns that should be combined together.
 
 ```{r}
-colA = paste0("dob_child", 1:3)
+colA = paste0("name_child", 1:3)
 colB = paste0("gender_child", 1:3)
-DT.m2 = melt(DT, measure.vars = list(colA, colB), value.name = c("dob", "gender"))
+DT.m2 = melt(DT, measure.vars = list(colA, colB), value.name = c("name", "gender"))
 DT.m2
 
 str(DT.m2) ## col type is preserved
@@ -206,7 +206,7 @@ str(DT.m2) ## col type is preserved
 Usually in these problems, the columns we'd like to melt can be distinguished by a common pattern. We can use the function `patterns()`, implemented for convenience, to provide regular expressions for the columns to be combined together. The above operation can be rewritten as:
 
 ```{r}
-DT.m2 = melt(DT, measure.vars = patterns("^dob", "^gender"), value.name = c("dob", "gender"))
+DT.m2 = melt(DT, measure.vars = patterns("^name", "^gender"), value.name = c("name", "gender"))
 DT.m2
 ```
 
@@ -305,7 +305,7 @@ We can now provide **multiple `value.var` columns** to `dcast` for `data.table`s
 
 ```{r}
 ## new 'cast' functionality - multiple value.vars
-DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("dob", "gender"))
+DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("name", "gender"))
 DT.c2
 ```
 

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ is.data.table(x)`
`31`	`31`	`}`
`32`	`32`	`\arguments{`
`33`	`33`	`\item{x}{An R object.}`
`34`		`- \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.}`
	`34`	`+ \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead. For lists and when calling \code{data.table()}, names from the first named vector are extracted and used as row names, similar to \code{data.frame()} behavior.}`
`35`	`35`	`\item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. }`
`36`	`36`	`\item{sorted}{logical used in \emph{array} method, default \code{TRUE} is overridden when \code{key} is provided. }`
`37`	`37`	`\item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.}`