Merge remote-tracking branch 'origin/master' into merge_factor_char_key

MichaelChirico · MichaelChirico · commit dda87222634a · 2025-06-30T18:42:45.000Z
diff --git a/NEWS.md b/NEWS.md
@@ -70,7 +70,9 @@
 
 14. `data.table()` function is now more aligned with `data.frame()` with respect to the names of the output when one of its inputs is a single-column matrix object, [#4124](https://github.com/Rdatatable/data.table/issues/4124). Thanks @PavoDive for the report and @jangorecki for the PR.
 
-15. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
+15. Including an `ITime` object as a named input to `data.frame()` respects the provided name, i.e. `data.frame(a = as.ITime(...))` will have column `a`, [#4673](https://github.com/Rdatatable/data.table/issues/4673). Thanks @shrektan for the report and @MichaelChirico for the fix.
+
+16. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)`) and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
 
 ### NOTES
 
diff --git a/R/IDateTime.R b/R/IDateTime.R
@@ -209,7 +209,7 @@ as.character.ITime = format.ITime = function(x, ...) {
   res
 }
 
-as.data.frame.ITime = function(x, ...) {
+as.data.frame.ITime = function(x, ..., optional=FALSE) {
   # This method is just for ggplot2, #1713
   # Avoids the error "cannot coerce class '"ITime"' into a data.frame", but for some reason
   # ggplot2 doesn't seem to call the print method to get axis labels, so still prints integers.
@@ -219,7 +219,8 @@ as.data.frame.ITime = function(x, ...) {
   # ans = list(as.POSIXct(x,tzone=""))  # ggplot2 gives "Error: Discrete value supplied to continuous scale"
   setattr(ans, "class", "data.frame")
   setattr(ans, "row.names", .set_row_names(length(x)))
-  setattr(ans, "names", "V1")
+  # require 'optional' support for passing back to e.g. data.frame() without overriding names there
+  if (!optional) setattr(ans, "names", "V1")
   ans
 }
 
diff --git a/R/data.table.R b/R/data.table.R
@@ -221,7 +221,7 @@ replace_dot_alias = function(e) {
     }
     return(x)
   }
-  if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'")
+  if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")
   missingroll = missing(roll)
   if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
   if (is.character(roll)) {
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -21290,3 +21290,9 @@ test(2322.12, levels(fctr(c("b","a","c"), rev=NA)), error="TRUE or FALSE")
 test(2322.21, levels(fctr(c("b","a","c"), sort=TRUE)), c("a","b","c"))
 test(2322.22, levels(fctr(c("b","a","c"), sort=NA)), error="TRUE or FALSE")
 test(2322.31, levels(fctr(c("b","a","c"), rev=TRUE, sort=TRUE)), c("c","b","a"))
+
+# data.frame() uses provided names of ITime inputs
+it <- as.ITime('00:00:00')
+test(2323.1, names(data.frame(COL = it)), "COL")
+test(2323.2, names(data.frame(b = 1, COL = it)), c("b", "COL"))
+test(2323.3, names(as.data.frame(it, optional=TRUE)), NULL)
diff --git a/src/bmerge.c b/src/bmerge.c
@@ -49,8 +49,10 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   // iArg, xArg, icolsArg and xcolsArg
   idtVec = SEXPPTR_RO(idt);  // set globals so bmerge_r can see them.
   xdtVec = SEXPPTR_RO(xdt);
-  if (!isInteger(icolsArg)) internal_error(__func__, "icols is not integer vector"); // # nocov
-  if (!isInteger(xcolsArg)) internal_error(__func__, "xcols is not integer vector"); // # nocov
+  if (!isInteger(icolsArg))
+    internal_error(__func__, "icols is not integer vector"); // # nocov
+  if (!isInteger(xcolsArg))
+    internal_error(__func__, "xcols is not integer vector"); // # nocov
   if ((LENGTH(icolsArg)==0 || LENGTH(xcolsArg)==0) && LENGTH(idt)>0) // We let through LENGTH(i) == 0 for tests 2126.*
     internal_error(__func__, "icols and xcols must be non-empty integer vectors");
   if (LENGTH(icolsArg) > LENGTH(xcolsArg)) internal_error(__func__, "length(icols) [%d] > length(xcols) [%d]", LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov
@@ -60,10 +62,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   iN = ilen = anslen = LENGTH(idt) ? LENGTH(VECTOR_ELT(idt,0)) : 0;
   ncol = LENGTH(icolsArg);    // there may be more sorted columns in x than involved in the join
   for(int col=0; col<ncol; col++) {
-    if (icols[col]==NA_INTEGER) internal_error(__func__, "icols[%d] is NA", col); // # nocov
-    if (xcols[col]==NA_INTEGER) internal_error(__func__, "xcols[%d] is NA", col); // # nocov
-    if (icols[col]>LENGTH(idt) || icols[col]<1) error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(idt));
-    if (xcols[col]>LENGTH(xdt) || xcols[col]<1) error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(xdt));
+    if (icols[col]==NA_INTEGER)
+      internal_error(__func__, "icols[%d] is NA", col); // # nocov
+    if (xcols[col]==NA_INTEGER)
+      internal_error(__func__, "xcols[%d] is NA", col); // # nocov
+    if (icols[col]>LENGTH(idt) || icols[col]<1)
+      internal_error(__func__, "icols[%d]=%d outside range [1,length(i)=%d]", col, icols[col], LENGTH(idt)); // # nocov. Should have been caught already.
+    if (xcols[col]>LENGTH(xdt) || xcols[col]<1)
+      internal_error(__func__, "xcols[%d]=%d outside range [1,length(x)=%d]", col, xcols[col], LENGTH(xdt)); // # nocov
     int it = TYPEOF(VECTOR_ELT(idt, icols[col]-1));
     int xt = TYPEOF(VECTOR_ELT(xdt, xcols[col]-1));
     if (iN && it!=xt)
@@ -75,11 +81,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   // rollArg, rollendsArg
   roll = 0.0; rollToNearest = FALSE;
   if (isString(rollarg)) {
-    if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'"));
-    if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet."));
+    if (strcmp(CHAR(STRING_ELT(rollarg, 0)), "nearest") != 0)
+      internal_error(__func__, "roll is character but not 'nearest'"); // # nocov. Only [.data.table exposes roll= directly, and this is already checked there.
+    if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP)
+      error(_("roll='nearest' can't be applied to a character column, yet."));
     roll=1.0; rollToNearest=TRUE;       // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later
   } else {
-    if (!isReal(rollarg)) internal_error(__func__, "roll is not character or double"); // # nocov
+    if (!isReal(rollarg))
+      internal_error(__func__, "roll is not character or double"); // # nocov
     roll = REAL(rollarg)[0];   // more common case (rolling forwards or backwards) or no roll when 0.0
   }
   rollabs = fabs(roll);
@@ -98,10 +107,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   }
 
   // mult arg
-  if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL;
-  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST;
-  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST;
-  else internal_error(__func__, "invalid value for 'mult'"); // # nocov
+  if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all"))
+    mult = ALL;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first"))
+    mult = FIRST;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last"))
+    mult = LAST;
+  else
+    internal_error(__func__, "invalid value for 'mult'"); // # nocov
 
   // opArg
   if (!isInteger(opArg) || length(opArg)!=ncol)
@@ -132,7 +145,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
     retLength = R_Calloc(anslen, int);
     retIndex = R_Calloc(anslen, int);
     // initialise retIndex here directly, as next loop is meant for both equi and non-equi joins
-    for (int j=0; j<anslen; j++) retIndex[j] = j+1;
+    for (int j=0; j<anslen; j++)
+      retIndex[j] = j+1;
   } else { // equi joins (or) non-equi join but no multiple matches
     retFirstArg = PROTECT(allocVector(INTSXP, anslen));
     retFirst = INTEGER(retFirstArg);
@@ -145,9 +159,11 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   for (int j=0; j<anslen; j++) {
     // defaults need to populated here as bmerge_r may well not touch many locations, say if the last row of i is before the first row of x.
     retFirst[j] = nomatch;   // default to no match for NA goto below
-    // retLength[j] = 0;   // TO DO: do this to save the branch below and later branches at R level to set .N to 0
-    retLength[j] = nomatch==0 ? 0 : 1;
   }
+  // retLength[j] = 0;   // TO DO: do this to save the branch below and later branches at R level to set .N to 0
+  int retLengthVal = (int)(nomatch != 0);
+  for (int j=0; j<anslen; j++)
+    retLength[j] = retLengthVal;
 
   // allLen1Arg
   allLen1Arg = PROTECT(allocVector(LGLSXP, 1));
@@ -174,7 +190,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   // xo arg
   xo = NULL;
   if (length(xoArg)) {
-    if (!isInteger(xoArg)) internal_error(__func__, "xoArg is not an integer vector"); // # nocov
+    if (!isInteger(xoArg))
+      internal_error(__func__, "xoArg is not an integer vector"); // # nocov
     xo = INTEGER(xoArg);
   }
 
@@ -391,10 +408,13 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg
       // final two 1's are lowmax and uppmax
     } else {
       int len = xupp-xlow-1+rollLow+rollUpp; // rollLow and rollUpp cannot both be true
-      if (mult==ALL && len>1) allLen1[0] = FALSE;
+      if (len>1) {
+        if (mult==ALL)
+          allLen1[0] = FALSE;                           // bmerge()$allLen1
+      }
       if (nqmaxgrp == 1) {
-        const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // extra +1 for 1-based indexing at R level
-        const int rl = (mult==ALL) ? len : 1;
+        const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // bmerge()$starts thus extra +1 for 1-based indexing at R level
+        const int rl = (mult==ALL) ? len : 1;                        // bmerge()$lens
         for (int j=ilow+1; j<iupp; j++) {   // usually iterates once only for j=ir
           const int k = o ? o[j]-1 : j;
           retFirst[k] = rf;
diff --git a/src/fread.c b/src/fread.c
@@ -193,6 +193,7 @@ bool freadCleanup(void)
 static inline uint64_t umax(uint64_t a, uint64_t b) { return a > b ? a : b; }
 static inline uint64_t umin(uint64_t a, uint64_t b) { return a < b ? a : b; }
 static inline  int64_t imin( int64_t a,  int64_t b) { return a < b ? a : b; }
+static inline   int iminInt(     int a,      int b) { return a < b ? a : b; }
 
 /** Return value of `x` clamped to the range [lower, upper] */
 static inline int64_t clamp_i64t(int64_t x, int64_t lower, int64_t upper) {
@@ -2238,9 +2239,9 @@ int freadMain(freadMainArgs _args) {
   double thRead = 0, thPush = 0;  // reductions of timings within the parallel region
   int max_col = 0;
   char *typeBumpMsg = NULL; size_t typeBumpMsgSize = 0;
-  int typeCounts[NUMTYPE]; // used for verbose output; needs populating after first read and before reread (if any) -- see later comment
-  #define internalErrSize 1000
-  char internalErr[internalErrSize+1]="";  // must be compile time size: the message is generated and we can't free before STOP
+  int typeCounts[NUMTYPE];  // used for verbose output; needs populating after first read and before reread (if any) -- see later comment
+  char internalErr[256] = "";  // must be compile time size: the message is generated and we can't free before STOP
+
   int64_t DTi = 0;                  // the current row number in DT that we are writing to
   const char *headPos = pos;       // the jump start corresponding to DTi
   int nSwept = 0;                  // count the number of dirty jumps that were swept
@@ -2296,7 +2297,7 @@ int freadMain(freadMainArgs _args) {
         nth = omp_get_num_threads();
         if (me != 0) {
           // # nocov start
-          snprintf(internalErr, internalErrSize, "Master thread is not thread 0 but thread %d.\n", me); // # notranslate
+          snprintf(internalErr, sizeof(internalErr), "Master thread is not thread 0 but thread %d.\n", me); // # notranslate
           stopTeam = true;
           // # nocov end
         }
@@ -2518,18 +2519,19 @@ int freadMain(freadMainArgs _args) {
                 // Can't print because we're likely not master. So accumulate message and print afterwards.
                 if (thisType < joldType) {   // thisType<0 (type-exception)
                   if (verbose) {
-                    char temp[1001];
-                    int len = snprintf(temp, 1000,
+                    char buffer[256];
+                    int len = snprintf(buffer, sizeof(buffer),
                       _("Column %d%s%.*s%s bumped from '%s' to '%s' due to <<%.*s>> on row %"PRId64"\n"),
                       j + 1, colNames ? " <<" : "", colNames ? (colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "",
                       typeName[IGNORE_BUMP(joldType)], typeName[IGNORE_BUMP(thisType)],
                       (int)(tch - fieldStart), fieldStart, (int64_t)(ctx.DTi + myNrow));
-                    if (len > 1000) len = 1000;
-                    if (len > 0) {
-                      typeBumpMsg = realloc(typeBumpMsg, typeBumpMsgSize + len + 1);
-                      strcpy(typeBumpMsg + typeBumpMsgSize, temp);
-                      typeBumpMsgSize += len;
-                    }
+                    
+                    len = iminInt(len, sizeof(buffer));
+
+                    typeBumpMsg = realloc(typeBumpMsg, typeBumpMsgSize + len + 1);
+                    strcpy(typeBumpMsg + typeBumpMsgSize, buffer);
+                    typeBumpMsgSize += len;
+
                   }
                   nTypeBump++;
                   if (joldType > 0) nTypeBumpCols++;
@@ -2570,7 +2572,7 @@ int freadMain(freadMainArgs _args) {
           }
           else if (headPos != thisJumpStart && nrowLimit > 0) { // do not care for dirty jumps since we do not read data and only want to know types
              // # nocov start
-            snprintf(internalErr, internalErrSize, "invalid head position. jump=%d, headPos=%p, thisJumpStart=%p, sof=%p", jump, headPos, thisJumpStart, sof); // # notranslate
+            snprintf(internalErr, sizeof(internalErr), "invalid head position. jump=%d, headPos=%p, thisJumpStart=%p, sof=%p", jump, headPos, thisJumpStart, sof); // # notranslate
             stopTeam = true;
             // # nocov end
           }
diff --git a/src/vecseq.c b/src/vecseq.c
@@ -10,9 +10,12 @@ SEXP vecseq(SEXP x, SEXP len, SEXP clamp)
   // Specially for use by [.data.table after binary search. Now so specialized that for general use
   // bit::vecseq is recommended (Jens has coded it in C now).
 
-  if (!isInteger(x)) error(_("x must be an integer vector"));
-  if (!isInteger(len)) error(_("len must be an integer vector"));
-  if (LENGTH(x) != LENGTH(len)) error(_("x and len must be the same length"));
+  if (!isInteger(x))
+    error(_("x must be an integer vector")); // # nocov
+  if (!isInteger(len))
+    error(_("len must be an integer vector")); // # nocov
+  if (LENGTH(x) != LENGTH(len))
+    error(_("x and len must be the same length")); // # nocov
   const int *ix = INTEGER(x);
   const int *ilen = INTEGER(len), nlen=LENGTH(len);
   int reslen = 0;
@@ -22,10 +25,13 @@ SEXP vecseq(SEXP x, SEXP len, SEXP clamp)
     reslen += ilen[i];
   }
   if (!isNull(clamp)) {
-    if (!isNumeric(clamp) || LENGTH(clamp)!=1) error(_("clamp must be a double vector length 1"));
+    if (!isNumeric(clamp) || LENGTH(clamp)!=1)
+      error(_("clamp must be a double vector length 1")); // # nocov
     double limit = REAL(clamp)[0];
-    if (limit<0) error(_("clamp must be positive"));
-    if (reslen>limit) error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit);
+    if (limit<0)
+      error(_("clamp must be positive")); // # nocov
+    if (reslen>limit)
+      error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit);
   }
   SEXP ans = PROTECT(allocVector(INTSXP, reslen));
   int *ians = INTEGER(ans);

Original file line number	Diff line number	Diff line change
`@@ -221,7 +221,7 @@ replace_dot_alias = function(e) {`
`221`	`221`	`}`
`222`	`222`	`return(x)`
`223`	`223`	`}`
`224`		`- if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'")`
	`224`	`+ if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")`
`225`	`225`	`missingroll = missing(roll)`
`226`	`226`	`if (length(roll)!=1L \|\| is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")`
`227`	`227`	`if (is.character(roll)) {`