Add an option for enabling new data.table(<1-column matrix>) auto-naming behavior (#7158)

MichaelChirico · web-flow · commit 2675539446ca · 2025-07-14T22:36:37.000-07:00
* implement option, add regression tests * regression test for dup-named case * fix for updated behavior * need option set for original tests * refine wording * item number fix * Regression test for having fixed #5367 * correct NEWS reference * include other fixed bugs in NEWS * fix test numbering
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,10 @@
 
 ## data.table [v1.17.99](https://github.com/Rdatatable/data.table/milestone/35)  (in development)
 
+### NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES 
+
+1. `data.table(x=1, <expr>)`, where `<expr>` is an expression resulting in a 1-column matrix without column names, will eventually have names `x` and `V2`, not `x` and `V1`. This is consistent with `data.table(x=1, <expr>)` where `<expr>` results in an atomic vector: for example, `data.table(x=1, cbind(1))` and `data.table(x=1, 1)` will both have columns named `x` and `V2`. In this release, the matrix case continues to be named `V1` by default, but the new behavior can be activated by setting `options(datatable.old.matrix.autoname=FALSE)`. See point 5 under Bug Fixes for more context; this change will provide more internal consistency as well as more consistency with `data.frame()`.
+
 ### NEW FEATURES
 
 1. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR.
@@ -62,7 +66,7 @@
 
 4. In rare cases, `data.table` failed to expand ALTREP columns when assigning a full column by reference. This could result in the target column getting modified unintentionally if the next call to the data.table was a modification by reference of the source column. E.g. in `DT[, b := as.character(a)]` the string conversion gets deferred and subsequent modification of column `a` would also modify column `b`, [#5400](https://github.com/Rdatatable/data.table/issues/5400). Thanks to @aquasync for the report and Václav Tlapák for the PR.
 
-5. `data.table()` function is now more aligned with `data.frame()` with respect to the names of the output when one of its inputs is a single-column matrix object, [#4124](https://github.com/Rdatatable/data.table/issues/4124). Thanks @PavoDive for the report, @jangorecki for the PR, and @MichaelChirico for a follow-up for back-compatibility.
+5. `data.table()` function is now more aligned with `data.frame()` with respect to the names of the output when one of its inputs is a single-column matrix object, [#4124](https://github.com/Rdatatable/data.table/issues/4124), [#3193](https://github.com/Rdatatable/data.table/issues/3193), and [#5367](https://github.com/Rdatatable/data.table/issues/5367). Thanks @PavoDive for the report, @jangorecki for the PR, and @MichaelChirico for a follow-up for back-compatibility.
 
 6. Including an `ITime` object as a named input to `data.frame()` respects the provided name, i.e. `data.frame(a = as.ITime(...))` will have column `a`, [#4673](https://github.com/Rdatatable/data.table/issues/4673). Thanks @shrektan for the report and @MichaelChirico for the fix.
 
diff --git a/R/as.data.table.R b/R/as.data.table.R
@@ -50,7 +50,7 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) {
     ans = data.table(rn=rownames(x), x, keep.rownames=FALSE)
     # auto-inferred name 'x' is not back-compatible & inconsistent, #7145
     if (ncol(x) == 1L && is.null(colnames(x)))
-      setnames(ans, 'x', 'V1')
+      setnames(ans, 'x', 'V1', skip_absent=TRUE)
     if (is.character(keep.rownames))
       setnames(ans, 'rn', keep.rownames[1L])
     return(ans)
@@ -162,7 +162,7 @@ as.data.table.list = function(x,
       xi = x[[i]] = as.POSIXct(xi)
     } else if (is.matrix(xi) || is.data.frame(xi)) {
       if (!is.data.table(xi)) {
-        if (is.matrix(xi) && NCOL(xi)<=1L && is.null(colnames(xi))) { # 1 column matrix naming #4124
+        if (is.matrix(xi) && NCOL(xi)==1L && is.null(colnames(xi)) && isFALSE(getOption('datatable.old.matrix.autoname'))) { # 1 column matrix naming #4124
           xi = x[[i]] = c(xi)
         } else {
           xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames)  # we will never allow a matrix to be a column; always unpack the columns
diff --git a/R/onLoad.R b/R/onLoad.R
@@ -73,7 +73,8 @@
   # In fread and fwrite we have moved back to using getOption's default argument since it is unlikely fread and fread will be called in a loop many times, plus they
   # are relatively heavy functions where the overhead in getOption() would not be noticed.  It's only really [.data.table where getOption default bit.
   # Improvement to base::getOption() now submitted (100x; 5s down to 0.05s):  https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17394
-  opts = c("datatable.verbose"="FALSE",        # datatable.<argument name>
+  opts = c(
+       "datatable.verbose"="FALSE",            # datatable.<argument name>
        "datatable.optimize"="Inf",             # datatable.<argument name>
        "datatable.print.nrows"="100L",         # datatable.<argument name>
        "datatable.print.topn"="5L",            # datatable.<argument name>
@@ -85,12 +86,14 @@
        "datatable.show.indices"="FALSE",       # for print.data.table
        "datatable.allow.cartesian"="FALSE",    # datatable.<argument name>
        "datatable.join.many"="TRUE",           # mergelist, [.data.table #4383 #914
-       "datatable.dfdispatchwarn"="TRUE",                   # not a function argument
-       "datatable.warnredundantby"="TRUE",                  # not a function argument
+       "datatable.dfdispatchwarn"="TRUE",      # not a function argument
+       "datatable.warnredundantby"="TRUE",     # not a function argument
        "datatable.alloccol"="1024L",           # argument 'n' of alloc.col. Over-allocate 1024 spare column slots
        "datatable.auto.index"="TRUE",          # DT[col=="val"] to auto add index so 2nd time faster
        "datatable.use.index"="TRUE",           # global switch to address #1422
-       "datatable.prettyprint.char" = NULL     # FR #1091
+       "datatable.prettyprint.char" = NULL,    # FR #1091
+       "datatable.old.matrix.autoname"="TRUE", # #7145: how data.table(x=1, matrix(1)) is auto-named set to change
+       NULL
        )
   for (i in setdiff(names(opts),names(options()))) {
     eval(parse(text=paste0("options(",i,"=",opts[i],")")))
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -21277,10 +21277,15 @@ if (test_R.utils) local({
 })
 
 # Create a data.table when one vector is transposed doesn't respect the name defined by user #4124
-test(2321.01, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
-test(2321.02, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
-test(2321.03, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
-test(2321.04, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
+local({
+  old = options(datatable.old.matrix.autoname=FALSE)
+  on.exit(options(old))
+
+  test(2321.01, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
+  test(2321.02, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
+  test(2321.03, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
+  test(2321.04, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
+})
 ## but respect named column vectors
 test(2321.05, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
 test(2321.06, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))
@@ -21318,6 +21323,30 @@ colnames(M) = c('A', '')
 test(2321.26, as.data.table(M), data.table(A=1:3, V2=4:6))
 test(2321.27, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), A=1:3, V2=4:6))
 
+# also respect old auto-naming rules by default (to be deprecated)
+test(2321.28, names(data.table(a=1, cbind(2), c=3, 4)), c("a", "V1", "c", "V4"))
+test(2321.29, names(data.table(cbind(1), cbind(2))), c("V1", "V1"))
+# also test behavior with a 0-column matrix
+M = cbind(1:3)
+test(2321.30, data.table(M[, 0L]), data.table(NULL))
+test(2321.31, data.table(a=1:3, M[, 0L]), data.table(a=1:3))
+
+local({
+  old = options(datatable.old.matrix.autoname=FALSE)
+  on.exit(options(old))
+  
+  test(2321.32, names(data.table(a=1, cbind(2), c=3, 4)), c("a", "V2", "c", "V4"))
+  # particularly buggy old behavior: can easily result in duplicate names
+  test(2321.33, names(data.table(cbind(1), cbind(2))), c("V1", "V2"))
+  M = cbind(1:3)
+  test(2321.34, data.table(M[, 0L]), data.table(NULL))
+  test(2321.35, data.table(a=1:3, M[, 0L]), data.table(a=1:3))
+
+  # a more subtle version of this as expressed in #5367
+  DT <- data.table(Counts=c(10, 20), Severity=c(1, 2))
+  test(2321.36, names(DT[,.(New_name = Severity %*% Counts)]), "New_name")
+})
+
 # New fctr() helper: like factor() but retaining order by default #4837
 test(2322.01, levels(fctr(c("b","a","c"))), c("b","a","c"))
 test(2322.02, levels(fctr(c(3,1,2))), c("3","1","2"))
@@ -21423,10 +21452,12 @@ DF <- data.frame(row.names = letters[1:6], V = 1:6)     # Test data.frame with e
 test(2330.6, as.data.table(list(a = 6:1, DF), keep.rownames=TRUE), data.table(rn=letters[1:6], a=6:1, V=1:6))
 
 z <- setNames(1:3, rep("", 3))  # vector with all-empty names     # behaviour with all-empty row names
-test(2330.7, as.data.table(list(z), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3))
+test(2330.7, as.data.table(list(z), keep.rownames=TRUE), data.table(rn="", V1=1:3))
 
-M <- matrix(1:6, nrow=3, dimnames=list(rep("", 3), c("V1", "V2")))   #  test of list(M) for empty-rowname'd matrix input
-test(2330.8, as.data.table(list(M), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3, V2=4:6))
+M <- matrix(1:6, nrow=3, dimnames=list(rep("", 3L), c("V1", "V2")))   #  test of list(M) for empty-rowname'd matrix input
+test(2330.8, as.data.table(list(M), keep.rownames=TRUE), data.table(rn="", V1=1:3, V2=4:6))
+# 0-column input can still provide rownames
+test(2330.9, as.data.table(list(M[, 0L], 1:3), keep.rownames=TRUE), data.table(rn="", V2=1:3))
 
 # .SD reference in '...' passed to lapply(FUN=) is recognized as data.table
 test(2331, lapply(list(data.table(a=1:2)), `[`, j=.SD[1L]), list(data.table(a=1L)))
diff --git a/man/data.table-options.Rd b/man/data.table-options.Rd
@@ -108,6 +108,15 @@
   }
 }
 
+\section{Back-compatibility Options}{
+  \describe{
+    \item{\code{datatable.old.matrix.autoname}}{Logical, default \code{TRUE}. Governs how the column coming from an unnamed
+      1-column matrix is auto-named in expressions like \code{data.table(x=1, cbind(1))}. When set to \code{FALSE}, that column
+      is named by its position (here \code{V2}), as for an atomic vector; any other value keeps the old name \code{V1}.
+    }
+  }
+}
+
 \seealso{
   \code{\link[base]{options}},
   \code{\link[base]{getOption}},