Merge branch 'master' into issue6964

venom1204 · web-flow · commit 05a6c296cb8f · 2025-07-03T00:26:58.000+05:30
diff --git a/NEWS.md b/NEWS.md
@@ -78,8 +78,10 @@
 
 17. A data.table with a column of class `vctrs_list_of` (from package {vctrs}) prints as expected, [#5948](https://github.com/Rdatatable/data.table/issues/5948). Before, they could be printed messily, e.g. printing every entry in a nested data.frame. Thanks @jesse-smith for the report, @DavisVaughan and @r2evans for contributing, and @MichaelChirico for the PR.
 
-18. Spurious warnings from internal code in `cube()`, `rollup()`, and `groupingsets()` are no longer surfaced to the caller, [#6964](https://github.com/Rdatatable/data.table/issues/6964). Thanks @ferenci-tamas for the report and @venom1204 for the fix.
+18. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)`) and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
 
+19. Spurious warnings from internal code in `cube()`, `rollup()`, and `groupingsets()` are no longer surfaced to the caller, [#6964](https://github.com/Rdatatable/data.table/issues/6964). Thanks @ferenci-tamas for the report and @venom1204 for the fix.
+ 
 ### NOTES
 
 1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.
diff --git a/R/data.table.R b/R/data.table.R
@@ -1348,21 +1348,8 @@ replace_dot_alias = function(e) {
         ans[icolsAns] = .Call(CsubsetDT, i, ii,    icols)
         ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols)
         setattr(ans, "names", ansvars)
-        if (haskey(x)) {
-          keylen = which.first(!key(x) %chin% ansvars)-1L
-          if (is.na(keylen)) keylen = length(key(x))
-          len = length(rightcols)
-          # fix for #1268, #1704, #1766 and #1823
-          chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE
-          if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
-            keylen = if (!chk) len else 0L # fix for #1268
-          }
-          ## check key on i as well!
-          ichk = is.data.table(i) && haskey(i) &&
-                 identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061
-          if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
-            setattr(ans,"sorted",head(key(x),keylen))
-        }
+        # NB: could be NULL
+        setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll))
         setattr(ans, "class", class(x))  # retain class that inherits from data.table, #64
         setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
         setalloccol(ans)
@@ -2034,6 +2021,48 @@ replace_dot_alias = function(e) {
   setalloccol(ans)   # TODO: overallocate in dogroups in the first place and remove this line
 }
 
+# Can the specified merge of x and i be marked as sorted?
+# Returns the leading columns of key(x) for which this is true, otherwise NULL.
+#   x, i      : the two tables being joined; only (a prefix of) key(x) is ever returned
+#   ans       : the join result; only inspected (via is.sorted) as a last resort, see #5361 below
+#   on_lhs    : names(on) when the caller supplied 'on', otherwise NULL
+#   ansvars   : column names of ans
+#   leftcols  : join columns of i (used to index names_i and to select columns of i)
+#   rightcols : join columns of x
+#   names_i   : names indexed by leftcols -- presumably names(i), confirm at the call site
+#   irows     : rows of x that were selected into ans
+#   roll      : the rolling-join setting of the call
+.join_result_key = function(x, i, ans, on_lhs, ansvars, leftcols, rightcols, names_i, irows, roll) {
+  x_key = key(x)
+  if (is.null(x_key))
+    return(NULL)
+
+  # number of leading key columns of x that are present in the result
+  key_length = which.first(!x_key %chin% ansvars) - 1L
+  if (is.na(key_length))
+    key_length = length(x_key)
+
+  rhs_length = length(rightcols)
+  # fix for #1268, #1704, #1766 and #1823
+  # chk is TRUE when 'on' was supplied and does not line up with the head of key(x)
+  chk = rhs_length && !is.null(on_lhs) && !identical(head(x_key, rhs_length), on_lhs)
+  if ( (key_length > rhs_length || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
+    key_length = if (chk) 0L else rhs_length # fix for #1268
+  }
+
+  if (!key_length)
+    return(NULL)
+
+  # i has the correct key, #3061
+  if (identical(head(key(i), length(leftcols)), names_i[leftcols]))
+    return(head(x_key, key_length))
+
+  if (!.Call(CisOrderedSubset, irows, nrow(x)))
+    return(NULL)
+
+  # see #1010. don't set key when i has no key, irows is ordered, but roll != FALSE (unless a single row was selected).
+  # Compare against FALSE explicitly, as the inline code this helper replaced did (roll == FALSE):
+  #   `&&` raises "invalid 'x' type" on a character operand, whereas `!=` simply yields TRUE.
+  # NOTE(review): roll is presumably numeric or a string such as "nearest" by the time we get here -- confirm in the caller
+  if (roll != FALSE && length(irows) != 1L)
+    return(NULL)
+
+  new_key = head(x_key, key_length)
+
+  #5361 merging on keyed factor with character, check if resulting character is really sorted
+  # when the join columns have identical storage types on both sides no extra check is done
+  if (identical(vapply_1c(.shallow(i, leftcols), typeof), vapply_1c(.shallow(x, rightcols), typeof)))
+    return(new_key)
+
+  if (!is.sorted(ans, by=new_key))
+    return(NULL)
+  new_key
+}
+
 # What's the name of the top-level call in 'j'?
 # NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026).
 root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else ""
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -7077,6 +7077,50 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US",
 setkey(y)
 test(1483.4, y[x], data.table(country="US", key="country"))
 
+# 5361 merge on character and factor should only have key(x) if result is really sorted
+lett = c("a", "b", "c")
+rlet = c("c", "b", "a")
+x = data.table(i=rlet)
+y = data.table(i=factor(lett, levels=rlet), key="i")
+test(1483.51, x[y, on="i"], x)
+test(1483.52, y[x, on="i"], x)
+test(1483.53, merge(x, y, by="i"), data.table(i=lett, key="i"))
+test(1483.54, merge(y, x, by="i"), data.table(i=lett, key="i"))
+x = data.table(i1=1:3, i2=rlet)
+y = data.table(i1=1:3, i2=factor(lett, levels=rlet), key=c("i1", "i2"))
+test(1483.55, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
+test(1483.56, y[x, on=c("i1", "i2")], x)
+test(1483.57, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+test(1483.58, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+
+x = data.table(i=rlet, key="i")
+y = data.table(i=factor(lett, levels=rlet))
+test(1483.61, x[y, on="i"], x)
+test(1483.62, y[x, on="i"], data.table(i=lett))
+test(1483.63, merge(x, y, by="i"), data.table(i=lett, key="i"))
+test(1483.64, merge(y, x, by="i"), data.table(i=lett, key="i"))
+x = data.table(i1=1:3, i2=rlet, key=c("i1", "i2"))
+y = data.table(i1=1:3, i2=factor(lett, levels=rlet))
+test(1483.65, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
+test(1483.66, y[x, on=c("i1", "i2")], data.table(i1=1:3, i2=rlet))
+test(1483.67, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+test(1483.68, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
+
+x = data.table(i=rlet, a=rlet)
+y = data.table(i=factor(lett, levels=rlet), b=lett, key="i")
+test(1483.71, x[y, on="i"], data.table(i=rlet, a=rlet, b=rlet))
+test(1483.72, y[x, on="i"], data.table(i=rlet, b=rlet, a=rlet))
+test(1483.73, merge(x, y, by="i"), data.table(i=lett, a=lett, b=lett, key="i"))
+test(1483.74, merge(y, x, by="i"), data.table(i=lett, b=lett, a=lett, key="i"))
+
+some_letters <- c("c", "b", "a")
+some_more_letters <- rep(c("a", "b", "c"), 2L)
+dt1 <- data.table(x = some_letters, y=1:3)
+dt2 <- data.table(x = factor(some_more_letters, levels=some_letters), z=1:6, key=c("x", "z"))
+dt3 <- merge(dt1, dt2, by="x")
+test(1483.81, key(dt3), "x")
+test(1483.82, nrow(dt3[x %in% "c", ]), 2L)
+
 # NULL items should be removed when making data.table from list, #842
 # Original fix for #842 added a branch in as.data.table.list() using point()
 # Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so there is no longer any need for the branch
diff --git a/src/assign.c b/src/assign.c
@@ -735,12 +735,11 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
   return(dt);  // needed for `*tmp*` mechanism (when := isn't used), and to return the new object after a := for compound syntax.
 }
 
-#define MSGSIZE 1000
-static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects
+static char memrecycle_message[1000]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects
 
 const char *columnDesc(int colnum, const char *colname) {
-  static char column_desc[MSGSIZE+1]; // can contain column names, hence relatively large allocation.
-  snprintf(column_desc, MSGSIZE, _("(column %d named '%s')"), colnum, colname);
+  static char column_desc[sizeof(memrecycle_message)]; // can contain column names, hence relatively large allocation.
+  snprintf(column_desc, sizeof(memrecycle_message), _("(column %d named '%s')"), colnum, colname);
   return column_desc;
 }
 
@@ -941,7 +940,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
         if (COND) {                                                                                                         \
           const char *sType = sourceIsI64 ? "integer64" : type2char(TYPEOF(source));                                        \
           const char *tType = targetIsI64 ? "integer64" : type2char(TYPEOF(target));                                        \
-          snprintf(memrecycle_message, MSGSIZE, FMT,                                                                        \
+          snprintf(memrecycle_message, sizeof(memrecycle_message), FMT,                                                     \
             FMTVAL, sType, i+1, tType,                                                                                      \
             /* NB: important for () to be part of the translated string as a signal of nominative case to translators */    \
             colnum == 0 ? _("(target vector)") : columnDesc(colnum, colname));                                              \
diff --git a/src/fmelt.c b/src/fmelt.c
@@ -612,7 +612,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str
         for (int j=0, ansloc=0, level=1; j<data->lmax; ++j) {
           const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow;
           char buff[20];
-          snprintf(buff, 20, "%d", level++); // # notranslate
+          snprintf(buff, sizeof(buff), "%d", level++); // # notranslate
           for (int k=0; k<thislen; ++k) SET_STRING_ELT(target, ansloc++, mkChar(buff));
         }
       }
@@ -649,7 +649,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str
         for (int j=0, ansloc=0; j<data->lmax; ++j) {
           const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow;
           char buff[20];
-          snprintf(buff, 20, "%d", nlevel+1); // # notranslate
+          snprintf(buff, sizeof(buff), "%d", nlevel + 1); // # notranslate
           SET_STRING_ELT(levels, nlevel++, mkChar(buff));  // generate levels = 1:nlevels
           for (int k=0; k<thislen; ++k) td[ansloc++] = nlevel;
         }
diff --git a/src/forder.c b/src/forder.c
@@ -99,17 +99,19 @@ static void cleanup(void) {
   savetl_end();  // Restore R's own usage of tl. Must run after the for loop in free_ustr() since only CHARSXP which had tl>0 (R's usage) are stored there.
 }
 
+// # nocov start
 void internal_error_with_cleanup(const char *call_name, const char *format, ...) {
   char buff[1024];
   va_list args;
   va_start(args, format);
 
-  vsnprintf(buff, 1023, format, args);
+  vsnprintf(buff, sizeof(buff), format, args);
   va_end(args);
 
   cleanup();
   error("%s %s: %s. %s", _("Internal error in"), call_name, buff, _("Please report to the data.table issues tracker."));
 }
+// # nocov end
 
 static void push(const int *x, const int n) {
   if (!retgrp) return;  // clearer to have the switch here rather than before each call
diff --git a/src/freadR.c b/src/freadR.c
@@ -233,9 +233,9 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) {
   for (int j=0; j<n; ++j) {
     int k = itemsD[j];
     if (k==NA_INTEGER || k<1 || k>ncol) {
-      static char buff[51];
-      if (dropSource==-1) snprintf(buff, 50, "drop[%d]", j+1); // # notranslate
-      else snprintf(buff, 50, "colClasses[[%d]][%d]", dropSource+1, j+1); // # notranslate
+      static char buff[50];
+      if (dropSource==-1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate
+      else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate
       if (k==NA_INTEGER) {
         if (isString(items))
           DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff);
@@ -264,7 +264,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
     SEXP elem;
     if (colNames==NULL || colNames[i].len<=0) {
       char buff[12];
-      snprintf(buff,12,"V%d",i+1); // # notranslate
+      snprintf(buff, sizeof(buff), "V%d", i + 1); // # notranslate
       elem = mkChar(buff);  // no PROTECT as passed immediately to SET_STRING_ELT
     } else {
       elem = mkCharLenCE(anchor+colNames[i].off, colNames[i].len, ienc);  // no PROTECT as passed immediately to SET_STRING_ELT
@@ -716,7 +716,7 @@ void halt__(bool warn, const char *format, ...) {
   va_list args;
   va_start(args, format);
   char msg[2000];
-  vsnprintf(msg, 2000, format, args);
+  vsnprintf(msg, sizeof(msg), format, args);
   va_end(args);
   freadCleanup(); // this closes mmp hence why we just copied substrings from mmp to msg[] first since mmp is now invalid
   // if (warn) warning(_("%s"), msg);
diff --git a/src/freadR.h b/src/freadR.h
@@ -21,8 +21,8 @@
 // Where no halt is happening, we can just use raw Rprintf() or warning()
 void halt__(bool warn, const char *format, ...);   // see freadR.c
 #define STOP(...)   halt__(0, __VA_ARGS__)
-static char internal_error_buff[1001] __attribute__((unused)); // match internalErrSize // todo: fix imports such that compiler warns correctly #6468
-#define INTERNAL_STOP(...) do {snprintf(internal_error_buff, 1000, __VA_ARGS__); halt__(0, "%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
+static char internal_error_buff[1000] __attribute__((unused)); // match internalErrSize // todo: fix imports such that compiler warns correctly #6468
+#define INTERNAL_STOP(...) do {snprintf(internal_error_buff, sizeof(internal_error_buff), __VA_ARGS__); halt__(0, "%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
 #define DTPRINT     Rprintf
 #define DTWARN(...) warningsAreErrors ? halt__(1, __VA_ARGS__) : warning(__VA_ARGS__)
 
diff --git a/src/fwrite.h b/src/fwrite.h
@@ -10,7 +10,7 @@
   #define STOP     error
   #define DTPRINT  Rprintf
   static char internal_error_buff[256] __attribute__((unused)); // todo: fix imports such that compiler warns correctly #6468
-  #define INTERNAL_STOP(...) do {snprintf(internal_error_buff, 255, __VA_ARGS__); error("%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
+  #define INTERNAL_STOP(...) do {snprintf(internal_error_buff, sizeof(internal_error_buff), __VA_ARGS__); error("%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
 #endif
 
 typedef void writer_fun_t(const void *, int64_t, char **);
diff --git a/src/rbindlist.c b/src/rbindlist.c
@@ -197,7 +197,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
   if (!fill && (usenames==TRUE || usenames==NA_LOGICAL)) {
     // Ensure no missings in both cases, and (when usenames==NA) all columns in same order too
     // We proceeded earlier as if fill was true, so varying ncol items will have missing here
-    char buff[1001] = "";
+    char buff[1000] = "";
     const char *extra = usenames==TRUE?"":_(" use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for "
                                             " backwards compatibility. See news item 5 in v1.12.2 for options to control this message.");
     for (int i=0; i<LENGTH(l); ++i) {
@@ -212,7 +212,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
           SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
           int w2 = colMap[i*ncol + j];
           const char *str = isString(s) ? CHAR(STRING_ELT(s,w2)) : "";
-          snprintf(buff, 1000, _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"),
+          snprintf(buff, sizeof(buff), _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"),
                         w2+1, str, i+1, missi+1, extra );
           if (usenames==TRUE) error("%s", buff); // # notranslate
           i = LENGTH(l); // break from outer i loop
@@ -221,7 +221,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
         if (w!=j && usenames==NA_LOGICAL) {
           SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
           if (!isString(s) || i==0) internal_error(__func__, "usenames==NA but an out-of-order name has been found in an item with no names or the first item. [%d]", i); // # nocov
-          snprintf(buff, 1000, _("Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE to match by column name, or use.names=FALSE to ignore column names.%s"),
+          snprintf(buff, sizeof(buff), _("Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE to match by column name, or use.names=FALSE to ignore column names.%s"),
                                w+1, CHAR(STRING_ELT(s,w)), i+1, j+1, i, extra);
           i = LENGTH(l);
           break;
@@ -336,7 +336,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
       }
     }
 
-    if (!foundName) { static char buff[12]; snprintf(buff,12,"V%d",j+1), SET_STRING_ELT(ansNames, idcol+j, mkChar(buff)); foundName=buff; } // # notranslate
+    if (!foundName) { static char buff[12]; snprintf(buff, sizeof(buff), "V%d", j + 1), SET_STRING_ELT(ansNames, idcol + j, mkChar(buff)); foundName = buff; } // # notranslate
     if (factor) maxType=INTSXP;  // if any items are factors then a factor is created (could be an option)
     if (int64 && !(maxType==REALSXP || maxType==STRSXP || maxType==VECSXP || maxType==CPLXSXP))
       internal_error(__func__, "column %d of result is determined to be integer64 but maxType=='%s' != REALSXP", j+1, type2char(maxType)); // # nocov
@@ -402,12 +402,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
               const int tl = TRUELENGTH(s);
               if (tl>=last) {  // if tl>=0 then also tl>=last because last<=0
                 if (tl>=0) {
-                  snprintf(warnStr, 1000,   // not direct warning as we're inside tl region
+                  snprintf(warnStr, sizeof(warnStr),   // not direct warning as we're inside tl region
                   _("Column %d of item %d is an ordered factor but level %d ['%s'] is missing from the ordered levels from column %d of item %d. " \
                     "Each set of ordered factor levels should be an ordered subset of the first longest. A regular factor will be created for this column."),
                   w+1, i+1, k+1, CHAR(s), longestW+1, longestI+1);
                 } else {
-                  snprintf(warnStr, 1000,
+                  snprintf(warnStr, sizeof(warnStr),
                   _("Column %d of item %d is an ordered factor with '%s'<'%s' in its levels. But '%s'<'%s' in the ordered levels from column %d of item %d. " \
                     "A regular factor will be created for this column due to this ambiguity."),
                   w+1, i+1, CHAR(levelsD[k-1]), CHAR(s), CHAR(s), CHAR(levelsD[k-1]), longestW+1, longestI+1);
diff --git a/src/utils.c b/src/utils.c
diff --git a/src/wrappers.c b/src/wrappers.c

Original file line number	Diff line number	Diff line change
`@@ -612,7 +612,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str`
`612`	`612`	`for (int j=0, ansloc=0, level=1; j<data->lmax; ++j) {`
`613`	`613`	`const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow;`
`614`	`614`	`char buff[20];`
`615`		`- snprintf(buff, 20, "%d", level++); // # notranslate`
	`615`	`+ snprintf(buff, sizeof(buff), "%d", level++); // # notranslate`
`616`	`616`	`for (int k=0; k<thislen; ++k) SET_STRING_ELT(target, ansloc++, mkChar(buff));`
`617`	`617`	`}`
`618`	`618`	`}`
`@@ -649,7 +649,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str`
`649`	`649`	`for (int j=0, ansloc=0; j<data->lmax; ++j) {`
`650`	`650`	`const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow;`
`651`	`651`	`char buff[20];`
`652`		`- snprintf(buff, 20, "%d", nlevel+1); // # notranslate`
	`652`	`+ snprintf(buff, sizeof(buff), "%d", nlevel + 1); // # notranslate`
`653`	`653`	`SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels`
`654`	`654`	`for (int k=0; k<thislen; ++k) td[ansloc++] = nlevel;`
`655`	`655`	`}`