Merge branch 'master' of https://github.com/Rdatatable/data.table into issue_2606

venom1204 · venom1204 · commit a04d3ede03b6 · 2025-07-12T02:31:31.000Z
diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R
@@ -277,5 +277,14 @@ test.list <- atime::atime_test_list(
     Slow = "73d79edf8ff8c55163e90631072192301056e336",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
     Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
 
+  "isoweek improved in #7144" = atime::atime_test(
+    setup = {
+      set.seed(349)
+      x = sample(Sys.Date() - 0:5000, N, replace=TRUE)
+    },
+    expr = data.table::isoweek(x),
+    Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
+    Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
+
     tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
diff --git a/.ci/linters/md/news_github_link_mismatch_linter.R b/.ci/linters/md/news_github_link_mismatch_linter.R
@@ -1,32 +1,6 @@
-# ensure that numbered list in each section is in sequence
-check_section_numbering = function(news) {
-  if (!grepl("NEWS", news)) return(invisible())
-  news = readLines(news)
-  # plain '#' catches some examples; 'd' for 'data.table'
-  sections = grep("^#+ [A-Zd]", news)
-  entries = grep("^[0-9]+[.]", news)
-  entry_value = as.integer(gsub("^([0-9]+)[.].*", "\\1", news[entries]))
-  section_id = findInterval(entries, sections)
-
-  any_mismatch = FALSE
-  for (id in unique(section_id)) {
-    section_entries = entry_value[section_id == id]
-    intended_value = seq_along(section_entries)
-    matched = section_entries == intended_value
-    if (all(matched)) next
-    any_mismatch = TRUE
-    section_header = news[sections[id]]
-    cat(sprintf(
-      "In section '%s' (line %d), bad numbering:\n%s\n",
-      section_header, sections[id],
-      paste0("  [", section_entries[!matched], " --> ", intended_value[!matched], "]", collapse="\n")
-    ))
-  }
-  stopifnot("Please fix the NEWS issues above" = !any_mismatch)
-}
-
 # ensure that GitHub link text & URL actually agree
-check_gh_links = function(news) {
+news_github_link_mismatch_linter = function(news) {
+  if (!grepl("NEWS", news)) return(invisible())
   news = readLines(news)
   gh_links_info = gregexpr(
     "\\[#(?<md_number>[0-9]+)\\]\\(https://github.com/Rdatatable/data.table/(?<link_type>[^/]+)/(?<link_number>[0-9]+)\\)",
diff --git a/.ci/linters/md/news_section_numbering_linter.R b/.ci/linters/md/news_section_numbering_linter.R
@@ -0,0 +1,26 @@
+# ensure that numbered list in each section is in sequence
+news_section_numbering_linter = function(news) {
+  if (!grepl("NEWS", news)) return(invisible())
+  news = readLines(news)
+  # section headers are '#'s followed by a capital letter or 'd' (for 'data.table'); a plain '#' would also catch some lines in code examples
+  sections = grep("^#+ [A-Zd]", news)
+  entries = grep("^[0-9]+[.]", news)
+  entry_value = as.integer(gsub("^([0-9]+)[.].*", "\\1", news[entries]))
+  section_id = findInterval(entries, sections)
+
+  any_mismatch = FALSE
+  for (id in unique(section_id)) {
+    section_entries = entry_value[section_id == id]
+    intended_value = seq_along(section_entries)
+    matched = section_entries == intended_value
+    if (all(matched)) next
+    any_mismatch = TRUE
+    section_header = news[sections[id]]
+    cat(sprintf(
+      "In section '%s' (line %d), bad numbering:\n%s\n",
+      section_header, sections[id],
+      paste0("  [", section_entries[!matched], " --> ", intended_value[!matched], "]", collapse="\n")
+    ))
+  }
+  stopifnot("Please fix the NEWS issues above" = !any_mismatch)
+}
diff --git a/.ci/linters/md/vignette_heading_id_linter.R b/.ci/linters/md/vignette_heading_id_linter.R
@@ -1,6 +1,6 @@
 # ensure that ids are limited to alphanumerics and dashes
 # (in particular, dots and underscores break the links)
-check_header_ids = function(md) {
+vignette_heading_id_linter = function(md) {
   if (!grepl('[.]Rmd$', md)) return(invisible())
   md = readLines(md)
   # A bit surprisingly, some headings don't start with a letter.
diff --git a/NEWS.md b/NEWS.md
diff --git a/R/IDateTime.R b/R/IDateTime.R
@@ -342,19 +342,20 @@ yday    = function(x) convertDate(as.IDate(x), "yday")
 wday    = function(x) convertDate(as.IDate(x), "wday")
 mday    = function(x) convertDate(as.IDate(x), "mday")
 week    = function(x) convertDate(as.IDate(x), "week")
-isoweek = function(x) {
+# TODO(#3279): Investigate whether the improved as.IDate() makes the commented-out implementation below faster than this one
+isoweek = function(x) as.integer(format(as.IDate(x), "%V"))
   # ISO 8601-conformant week, as described at
   #   https://en.wikipedia.org/wiki/ISO_week_date
   # Approach:
   # * Find nearest Thursday to each element of x
   # * Find the number of weeks having passed between
   #   January 1st of the year of the nearest Thursdays and x
 
-  x = as.IDate(x)   # number of days since 1 Jan 1970 (a Thurs)
-  nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
-  year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
-  1L + (nearest_thurs - year_start) %/% 7L
-}
+#  x = as.IDate(x)   # number of days since 1 Jan 1970 (a Thurs)
+#  nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
+#  year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
+#  1L + (nearest_thurs - year_start) %/% 7L
+
 
 month   = function(x) convertDate(as.IDate(x), "month")
 quarter = function(x) convertDate(as.IDate(x), "quarter")
diff --git a/R/as.data.table.R b/R/as.data.table.R
@@ -48,6 +48,9 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) {
   if (!identical(keep.rownames, FALSE)) {
     # can specify col name to keep.rownames, #575
     ans = data.table(rn=rownames(x), x, keep.rownames=FALSE)
+    # auto-inferred name 'x' is not back-compatible & inconsistent, #7145
+    if (ncol(x) == 1L && is.null(colnames(x)))
+      setnames(ans, 'x', 'V1')
     if (is.character(keep.rownames))
       setnames(ans, 'rn', keep.rownames[1L])
     return(ans)
@@ -133,9 +136,26 @@ as.data.table.list = function(x,
   missing.check.names = missing(check.names)
   origListNames = if (missing(.named)) names(x) else NULL  # as.data.table called directly, not from inside data.table() which provides .named, #3854
   empty_atomic = FALSE
+
+  # Handle keep.rownames for vectors (mimicking data.frame behavior)
+  rownames_ = NULL
+  check_rownames = !isFALSE(keep.rownames)
+
   for (i in seq_len(n)) {
     xi = x[[i]]
     if (is.null(xi)) next    # eachncol already initialized to 0 by integer() above
+    if (check_rownames && is.null(rownames_)) {
+      if (is.null(dim(xi))) {
+        if (!is.null(nm <- names(xi))) {
+          rownames_ = nm
+          x[[i]] = unname(xi)
+        }
+      } else {
+        if (!is.null(nm <- rownames(xi))) {
+          rownames_ = nm
+        }
+      }
+    }
     if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE
     if ("POSIXlt" %chin% class(xi)) {
       warningf("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
@@ -200,6 +220,18 @@ as.data.table.list = function(x,
   }
   if (any(vnames==".SD")) stopf("A column may not be called .SD. That has special meaning.")
   if (check.names) vnames = make.names(vnames, unique=TRUE)
+
+  # Add rownames column when vector names were found
+  if (!is.null(rownames_)) {
+    rn_name = if (is.character(keep.rownames)) keep.rownames[1L] else "rn"
+    if (!is.na(idx <- chmatch(rn_name, vnames)[1L])) {
+      ans = c(list(ans[[idx]]), ans[-idx])
+      vnames = c(vnames[idx], vnames[-idx])
+    } else {
+      ans = c(list(recycle(rownames_, nrow)), ans)
+      vnames = c(rn_name, vnames)
+    }
+  }
   setattr(ans, "names", vnames)
   setDT(ans, key=key) # copy ensured above; also, setDT handles naming
   if (length(origListNames)==length(ans)) setattr(ans, "names", origListNames)  # PR 3854 and tests 2058.15-17
diff --git a/R/print.data.table.R b/R/print.data.table.R
@@ -141,6 +141,8 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
     print_default(toprint)
     return(invisible(x))
   }
+  if (col.names == "none")
+    colnames(toprint) = rep.int("", ncol(toprint))
   if (nrow(toprint)>20L && col.names == "auto")
     # repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them
     #   option to shut this off per request of Oleg Bondar on SO, #1482
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -21276,13 +21276,46 @@ if (test_R.utils) local({
 })
 
 # Create a data.table when one vector is transposed doesn't respect the name defined by user #4124
-test(2321.1, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
-test(2321.2, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
-test(2321.3, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
-test(2321.4, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
+test(2321.01, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
+test(2321.02, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
+test(2321.03, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
+test(2321.04, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
 ## but respect named column vectors
-test(2321.5, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
-test(2321.6, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))
+test(2321.05, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
+test(2321.06, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))
+## also respect old naming pattern when invoked indirectly, #7145
+M = cbind(1:3)
+test(2321.07, as.data.table(M), data.table(V1=1:3))
+rownames(M) = c('a', 'b', 'c')
+test(2321.08, as.data.table(M), data.table(V1=1:3))
+test(2321.09, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3))
+colnames(M) = 'zz'
+test(2321.10, as.data.table(M), data.table(zz=1:3))
+test(2321.11, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), zz=1:3))
+colnames(M) = 'x'
+test(2321.12, as.data.table(M), data.table(x=1:3))
+test(2321.13, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3))
+M = cbind(M, y=4:6)
+test(2321.14, as.data.table(M), data.table(x=1:3, y=4:6))
+test(2321.15, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3, y=4:6))
+colnames(M) = c('A', 'B')
+test(2321.16, as.data.table(M), data.table(A=1:3, B=4:6))
+test(2321.17, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), A=1:3, B=4:6))
+colnames(M) = NULL
+test(2321.18, as.data.table(M), data.table(V1=1:3, V2=4:6))
+test(2321.19, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, V2=4:6))
+colnames(M) = c('x', '')
+test(2321.20, as.data.table(M), data.table(x=1:3, V2=4:6))
+test(2321.21, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3, V2=4:6))
+colnames(M) = c('', 'x')
+test(2321.22, as.data.table(M), data.table(V1=1:3, x=4:6))
+test(2321.23, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, x=4:6))
+colnames(M) = c('', '')
+test(2321.24, as.data.table(M), data.table(V1=1:3, V2=4:6))
+test(2321.25, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, V2=4:6))
+colnames(M) = c('A', '')
+test(2321.26, as.data.table(M), data.table(A=1:3, V2=4:6))
+test(2321.27, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), A=1:3, V2=4:6))
 
 # New fctr() helper: like factor() but retaining order by default #4837
 test(2322.01, levels(fctr(c("b","a","c"))), c("b","a","c"))
@@ -21366,27 +21399,55 @@ DT[, i := integer()]
 DT[, f2 := factor()]
 test(2328.2, droplevels(DT), data.table(f=factor(), i=integer(), f2=factor()))
 
+# print() output with col.names="none", #6882
+dt = data.table(short = 1:3, verylongcolumnname = 4:6)
+test(2329.1, print(dt, col.names = "none"), output = "1: 1 4\n2: 2 5\n3: 3 6\n")
+dt = data.table(x = 123456, y = "wide_string")
+test(2329.2, print(dt, col.names = "none"), output = "1: 123456 wide_string\n")
+dt = data.table(a = NA_integer_, b = NaN)
+test(2329.3, print(dt, col.names = "none"), output = "1: NA NaN\n")
+
+# Row name extraction from multiple vectors, #7136
+x <- 1:3 
+y <- setNames(4:6, c("A", "B", "C"))  
+test(2330.1, as.data.table(list(x, y), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), V1=1:3, V2=4:6))
+test(2330.2, as.data.table(list(x, y), keep.rownames="custom"), data.table(custom=c("A", "B", "C"), V1=1:3, V2=4:6))
+test(2330.3, as.data.table(list(y, x), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), V1=4:6, V2=1:3)) 
+
+# Behavior under data.frame()
+test(2330.4, as.data.table(data.frame(x, y), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), x=1:3, y=4:6))
+test(2330.5, as.data.table(data.frame(y, x), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), y=4:6, x=1:3))
+
+DF <- data.frame(row.names = letters[1:6], V = 1:6)     # Test data.frame with explicit rownames
+test(2330.6, as.data.table(list(a = 6:1, DF), keep.rownames=TRUE), data.table(rn=letters[1:6], a=6:1, V=1:6))
+
+z <- setNames(1:3, rep("", 3))  # vector with all-empty names, to check behaviour with all-empty row names
+test(2330.7, as.data.table(list(z), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3))
+
+M <- matrix(1:6, nrow=3, dimnames=list(rep("", 3), c("V1", "V2")))   #  test of list(M) for empty-rowname'd matrix input
+test(2330.8, as.data.table(list(M), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3, V2=4:6))
+
 #2606
-test(2329.1, {
-  dt1 <- data.table(a = 1)
-  lst <- list(inner = dt1)
-  res <- tables(recursive=TRUE)
+test(2331.1, {
+  dt1 = data.table(a = 1)
+  lst = list(inner = dt1)
+  res = tables(recursive=TRUE)
   any(res$NAME == "lst$inner")
 }, TRUE)
-test(2329.2, {
-  lst <- list(data.table(b = 2))
-  res <- tables(recursive=TRUE)
+test(2331.2, {
+  lst = list(data.table(b = 2))
+  res = tables(recursive=TRUE)
   any(grepl("^lst\\[\\[1\\]\\]$", res$NAME))
 }, TRUE)
-test(2329.3, {
-  nested <- list(l1 = list(l2 = data.table(c = 3)))
-  res <- tables(recursive=TRUE)
+test(2331.3, {
+  nested = list(l1 = list(l2 = data.table(c = 3)))
+  res = tables(recursive=TRUE)
   any(res$NAME == "nested$l1$l2")
 }, TRUE)
-test(2329.4, {
-  cycle <- list()
-  cycle[[1]] <- cycle
-  cycle[[2]] <- data.table(x = 1)
-  res <- tables(recursive=TRUE)
+test(2331.4, {
+  cycle = list()
+  cycle[[1]] = cycle
+  cycle[[2]] = data.table(x = 1)
+  res = tables(recursive=TRUE)
   any(res$NAME == "cycle[[2]]") && !"cycle[[1]]" %in% res$NAME
 }, TRUE)
diff --git a/man/as.data.table.Rd b/man/as.data.table.Rd
@@ -31,7 +31,7 @@ is.data.table(x)
 }
 \arguments{
   \item{x}{An R object.}
-  \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.}
+  \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead. For lists and when calling \code{data.table()}, the names of the first named vector (or the row names of the first element with dimensions that has them) are extracted and used as the row-names column, similar to \code{data.frame()} behavior.}
   \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. }
   \item{sorted}{logical used in \emph{array} method, default \code{TRUE} is overridden when \code{key} is provided. }
   \item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.}
diff --git a/man/data.table.Rd b/man/data.table.Rd
@@ -117,9 +117,11 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
 
     \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use \code{keyby=} routinely when you wish the result to be sorted. May also be \code{TRUE} or \code{FALSE} when \code{by} is provided as an alternative way to accomplish the same operation.}
 
-    \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In case of overlapping variables names inside dataset and in parent scope you can use double dot prefix \code{..cols} to explicitly refer to \code{cols} variable parent scope and not from your dataset.
+    \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In the case of overlapping variable names inside \code{x} and in parent scope, you can use the double dot prefix \code{..cols} to explicitly refer to the \code{cols} variable in parent scope and not from \code{x}.
 
-        When \code{j} is a character vector of column names, a numeric vector of column positions to select or of the form \code{startcol:endcol}, and the value returned is always a \code{data.table}. \code{with=FALSE} is not necessary anymore to select columns dynamically. Note that \code{x[, cols]} is equivalent to \code{x[, ..cols]} and to \code{x[, cols, with=FALSE]} and to \code{x[, .SD, .SDcols=cols]}.}
+        When \code{j} is a character vector of column names, a numeric vector of column positions to select, or of the form \code{startcol:endcol}, the value returned is always a \code{data.table}.
+
+        New code should rarely use this argument, which was originally needed for similarity to data.frame. For example, to select columns from a character vector \code{cols}, in data.frame we do \code{x[, cols]}, which has several equivalents in data.table: \code{x[, .SD, .SDcols=cols]}, \code{x[, ..cols]}, \code{x[, cols, env = list(cols = I(cols))]}, or \code{x[, cols, with=FALSE]}.}
 
     \item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. }
 
diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd

Original file line number	Diff line number	Diff line change
`@@ -141,6 +141,8 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),`
`141`	`141`	`print_default(toprint)`
`142`	`142`	`return(invisible(x))`
`143`	`143`	`}`
	`144`	`+ if (col.names == "none")`
	`145`	`+ colnames(toprint) = rep.int("", ncol(toprint))`
`144`	`146`	`if (nrow(toprint)>20L && col.names == "auto")`
`145`	`147`	`# repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them`
`146`	`148`	`# option to shut this off per request of Oleg Bondar on SO, #1482`
Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ is.data.table(x)`
`31`	`31`	`}`
`32`	`32`	`\arguments{`
`33`	`33`	`\item{x}{An R object.}`
`34`		`- \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.}`
	`34`	`+ \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead. For lists and when calling \code{data.table()}, names from the first named vector are extracted and used as row names, similar to \code{data.frame()} behavior.}`
`35`	`35`	`\item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. }`
`36`	`36`	`\item{sorted}{logical used in \emph{array} method, default \code{TRUE} is overridden when \code{key} is provided. }`
`37`	`37`	`\item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.}`