Skip to content

Commit 05a6c29

Browse files
authored
Merge branch 'master' into issue6964
2 parents c03d176 + 46816e8 commit 05a6c29

File tree

12 files changed

+119
-43
lines changed

12 files changed

+119
-43
lines changed

NEWS.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,10 @@
7878
7979
17. A data.table with a column of class `vctrs_list_of` (from package {vctrs}) prints as expected, [#5948](https://github.com/Rdatatable/data.table/issues/5948). Before, they could be printed messily, e.g. printing every entry in a nested data.frame. Thanks @jesse-smith for the report, @DavisVaughan and @r2evans for contributing, and @MichaelChirico for the PR.
8080
81-
18. Spurious warnings from internal code in `cube()`, `rollup()`, and `groupingsets()` are no longer surfaced to the caller, [#6964](https://github.com/Rdatatable/data.table/issues/6964). Thanks @ferenci-tamas for the report and @venom1204 for the fix.
81+
18. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
8282
83+
19. Spurious warnings from internal code in `cube()`, `rollup()`, and `groupingsets()` are no longer surfaced to the caller, [#6964](https://github.com/Rdatatable/data.table/issues/6964). Thanks @ferenci-tamas for the report and @venom1204 for the fix.
84+
8385
### NOTES
8486
8587
1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.

R/data.table.R

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,21 +1348,8 @@ replace_dot_alias = function(e) {
13481348
ans[icolsAns] = .Call(CsubsetDT, i, ii, icols)
13491349
ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols)
13501350
setattr(ans, "names", ansvars)
1351-
if (haskey(x)) {
1352-
keylen = which.first(!key(x) %chin% ansvars)-1L
1353-
if (is.na(keylen)) keylen = length(key(x))
1354-
len = length(rightcols)
1355-
# fix for #1268, #1704, #1766 and #1823
1356-
chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE
1357-
if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
1358-
keylen = if (!chk) len else 0L # fix for #1268
1359-
}
1360-
## check key on i as well!
1361-
ichk = is.data.table(i) && haskey(i) &&
1362-
identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061
1363-
if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
1364-
setattr(ans,"sorted",head(key(x),keylen))
1365-
}
1351+
# NB: could be NULL
1352+
setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll))
13661353
setattr(ans, "class", class(x)) # retain class that inherits from data.table, #64
13671354
setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
13681355
setalloccol(ans)
@@ -2034,6 +2021,48 @@ replace_dot_alias = function(e) {
20342021
setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line
20352022
}
20362023

2024+
# can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL
2025+
.join_result_key <- function(x, i, ans, on_lhs, ansvars, leftcols, rightcols, names_i, irows, roll) {
2026+
x_key <- key(x)
2027+
if (is.null(x_key))
2028+
return(NULL)
2029+
2030+
key_length = which.first(!x_key %chin% ansvars) - 1L
2031+
if (is.na(key_length))
2032+
key_length = length(x_key)
2033+
2034+
rhs_length = length(rightcols)
2035+
# fix for #1268, #1704, #1766 and #1823
2036+
chk = rhs_length && !is.null(on_lhs) && !identical(head(x_key, rhs_length), on_lhs)
2037+
if ( (key_length > rhs_length || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
2038+
key_length = if (chk) 0L else rhs_length # fix for #1268
2039+
}
2040+
2041+
if (!key_length)
2042+
return(NULL)
2043+
2044+
# i has the correct key, #3061
2045+
if (identical(head(key(i), length(leftcols)), names_i[leftcols]))
2046+
return(head(x_key, key_length))
2047+
2048+
if (!.Call(CisOrderedSubset, irows, nrow(x)))
2049+
return(NULL)
2050+
2051+
# see #1010. don't set key when i has no key, but irows is ordered and !roll
2052+
if (roll && length(irows) != 1L)
2053+
return(NULL)
2054+
2055+
new_key <- head(x_key, key_length)
2056+
2057+
#5361 merging on keyed factor with character, check if resulting character is really sorted
2058+
if (identical(vapply_1c(.shallow(i, leftcols), typeof), vapply_1c(.shallow(x, rightcols), typeof)))
2059+
return(new_key)
2060+
2061+
if (!is.sorted(ans, by=new_key))
2062+
return(NULL)
2063+
new_key
2064+
}
2065+
20372066
# What's the name of the top-level call in 'j'?
20382067
# NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026).
20392068
root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else ""

inst/tests/tests.Rraw

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7077,6 +7077,50 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US",
70777077
setkey(y)
70787078
test(1483.4, y[x], data.table(country="US", key="country"))
70797079

7080+
# 5361 merge on character and factor should only have key(x) if result is really sorted
7081+
lett = c("a", "b", "c")
7082+
rlet = c("c", "b", "a")
7083+
x = data.table(i=rlet)
7084+
y = data.table(i=factor(lett, levels=rlet), key="i")
7085+
test(1483.51, x[y, on="i"], x)
7086+
test(1483.52, y[x, on="i"], x)
7087+
test(1483.53, merge(x, y, by="i"), data.table(i=lett, key="i"))
7088+
test(1483.54, merge(y, x, by="i"), data.table(i=lett, key="i"))
7089+
x = data.table(i1=1:3, i2=rlet)
7090+
y = data.table(i1=1:3, i2=factor(lett, levels=rlet), key=c("i1", "i2"))
7091+
test(1483.55, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
7092+
test(1483.56, y[x, on=c("i1", "i2")], x)
7093+
test(1483.57, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7094+
test(1483.58, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7095+
7096+
x = data.table(i=rlet, key="i")
7097+
y = data.table(i=factor(lett, levels=rlet))
7098+
test(1483.61, x[y, on="i"], x)
7099+
test(1483.62, y[x, on="i"], data.table(i=lett))
7100+
test(1483.63, merge(x, y, by="i"), data.table(i=lett, key="i"))
7101+
test(1483.64, merge(y, x, by="i"), data.table(i=lett, key="i"))
7102+
x = data.table(i1=1:3, i2=rlet, key=c("i1", "i2"))
7103+
y = data.table(i1=1:3, i2=factor(lett, levels=rlet))
7104+
test(1483.65, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
7105+
test(1483.66, y[x, on=c("i1", "i2")], data.table(i1=1:3, i2=rlet))
7106+
test(1483.67, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7107+
test(1483.68, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7108+
7109+
x = data.table(i=rlet, a=rlet)
7110+
y = data.table(i=factor(lett, levels=rlet), b=lett, key="i")
7111+
test(1483.71, x[y, on="i"], data.table(i=rlet, a=rlet, b=rlet))
7112+
test(1483.72, y[x, on="i"], data.table(i=rlet, b=rlet, a=rlet))
7113+
test(1483.73, merge(x, y, by="i"), data.table(i=lett, a=lett, b=lett, key="i"))
7114+
test(1483.74, merge(y, x, by="i"), data.table(i=lett, b=lett, a=lett, key="i"))
7115+
7116+
some_letters <- c("c", "b", "a")
7117+
some_more_letters <- rep(c("a", "b", "c"), 2L)
7118+
dt1 <- data.table(x = some_letters, y=1:3)
7119+
dt2 <- data.table(x = factor(some_more_letters, levels=some_letters), z=1:6, key=c("x", "z"))
7120+
dt3 <- merge(dt1, dt2, by="x")
7121+
test(1483.81, key(dt3), "x")
7122+
test(1483.82, nrow(dt3[x %in% "c", ]), 2L)
7123+
70807124
# NULL items should be removed when making data.table from list, #842
70817125
# Original fix for #842 added a branch in as.data.table.list() using point()
70827126
# Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so longer need for the branch

src/assign.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -735,12 +735,11 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
735735
return(dt); // needed for `*tmp*` mechanism (when := isn't used), and to return the new object after a := for compound syntax.
736736
}
737737

738-
#define MSGSIZE 1000
739-
static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects
738+
static char memrecycle_message[1000]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects
740739

741740
const char *columnDesc(int colnum, const char *colname) {
742-
static char column_desc[MSGSIZE+1]; // can contain column names, hence relatively large allocation.
743-
snprintf(column_desc, MSGSIZE, _("(column %d named '%s')"), colnum, colname);
741+
static char column_desc[sizeof(memrecycle_message)]; // can contain column names, hence relatively large allocation.
742+
snprintf(column_desc, sizeof(memrecycle_message), _("(column %d named '%s')"), colnum, colname);
744743
return column_desc;
745744
}
746745

@@ -941,7 +940,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
941940
if (COND) { \
942941
const char *sType = sourceIsI64 ? "integer64" : type2char(TYPEOF(source)); \
943942
const char *tType = targetIsI64 ? "integer64" : type2char(TYPEOF(target)); \
944-
snprintf(memrecycle_message, MSGSIZE, FMT, \
943+
snprintf(memrecycle_message, sizeof(memrecycle_message), FMT, \
945944
FMTVAL, sType, i+1, tType, \
946945
/* NB: important for () to be part of the translated string as a signal of nominative case to translators */ \
947946
colnum == 0 ? _("(target vector)") : columnDesc(colnum, colname)); \

src/fmelt.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -612,7 +612,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str
612612
for (int j=0, ansloc=0, level=1; j<data->lmax; ++j) {
613613
const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow;
614614
char buff[20];
615-
snprintf(buff, 20, "%d", level++); // # notranslate
615+
snprintf(buff, sizeof(buff), "%d", level++); // # notranslate
616616
for (int k=0; k<thislen; ++k) SET_STRING_ELT(target, ansloc++, mkChar(buff));
617617
}
618618
}
@@ -649,7 +649,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str
649649
for (int j=0, ansloc=0; j<data->lmax; ++j) {
650650
const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow;
651651
char buff[20];
652-
snprintf(buff, 20, "%d", nlevel+1); // # notranslate
652+
snprintf(buff, sizeof(buff), "%d", nlevel + 1); // # notranslate
653653
SET_STRING_ELT(levels, nlevel++, mkChar(buff)); // generate levels = 1:nlevels
654654
for (int k=0; k<thislen; ++k) td[ansloc++] = nlevel;
655655
}

src/forder.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,17 +99,19 @@ static void cleanup(void) {
9999
savetl_end(); // Restore R's own usage of tl. Must run after the for loop in free_ustr() since only CHARSXP which had tl>0 (R's usage) are stored there.
100100
}
101101

102+
// # nocov start
102103
void internal_error_with_cleanup(const char *call_name, const char *format, ...) {
103104
char buff[1024];
104105
va_list args;
105106
va_start(args, format);
106107

107-
vsnprintf(buff, 1023, format, args);
108+
vsnprintf(buff, sizeof(buff), format, args);
108109
va_end(args);
109110

110111
cleanup();
111112
error("%s %s: %s. %s", _("Internal error in"), call_name, buff, _("Please report to the data.table issues tracker."));
112113
}
114+
// # nocov end
113115

114116
static void push(const int *x, const int n) {
115117
if (!retgrp) return; // clearer to have the switch here rather than before each call

src/freadR.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,9 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) {
233233
for (int j=0; j<n; ++j) {
234234
int k = itemsD[j];
235235
if (k==NA_INTEGER || k<1 || k>ncol) {
236-
static char buff[51];
237-
if (dropSource==-1) snprintf(buff, 50, "drop[%d]", j+1); // # notranslate
238-
else snprintf(buff, 50, "colClasses[[%d]][%d]", dropSource+1, j+1); // # notranslate
236+
static char buff[50];
237+
if (dropSource==-1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate
238+
else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate
239239
if (k==NA_INTEGER) {
240240
if (isString(items))
241241
DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff);
@@ -264,7 +264,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
264264
SEXP elem;
265265
if (colNames==NULL || colNames[i].len<=0) {
266266
char buff[12];
267-
snprintf(buff,12,"V%d",i+1); // # notranslate
267+
snprintf(buff, sizeof(buff), "V%d", i + 1); // # notranslate
268268
elem = mkChar(buff); // no PROTECT as passed immediately to SET_STRING_ELT
269269
} else {
270270
elem = mkCharLenCE(anchor+colNames[i].off, colNames[i].len, ienc); // no PROTECT as passed immediately to SET_STRING_ELT
@@ -716,7 +716,7 @@ void halt__(bool warn, const char *format, ...) {
716716
va_list args;
717717
va_start(args, format);
718718
char msg[2000];
719-
vsnprintf(msg, 2000, format, args);
719+
vsnprintf(msg, sizeof(msg), format, args);
720720
va_end(args);
721721
freadCleanup(); // this closes mmp hence why we just copied substrings from mmp to msg[] first since mmp is now invalid
722722
// if (warn) warning(_("%s"), msg);

src/freadR.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
// Where no halt is happening, we can just use raw Rprintf() or warning()
2222
void halt__(bool warn, const char *format, ...); // see freadR.c
2323
#define STOP(...) halt__(0, __VA_ARGS__)
24-
static char internal_error_buff[1001] __attribute__((unused)); // match internalErrSize // todo: fix imports such that compiler warns correctly #6468
25-
#define INTERNAL_STOP(...) do {snprintf(internal_error_buff, 1000, __VA_ARGS__); halt__(0, "%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
24+
static char internal_error_buff[1000] __attribute__((unused)); // match internalErrSize // todo: fix imports such that compiler warns correctly #6468
25+
#define INTERNAL_STOP(...) do {snprintf(internal_error_buff, sizeof(internal_error_buff), __VA_ARGS__); halt__(0, "%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
2626
#define DTPRINT Rprintf
2727
#define DTWARN(...) warningsAreErrors ? halt__(1, __VA_ARGS__) : warning(__VA_ARGS__)
2828

src/fwrite.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#define STOP error
1111
#define DTPRINT Rprintf
1212
static char internal_error_buff[256] __attribute__((unused)); // todo: fix imports such that compiler warns correctly #6468
13-
#define INTERNAL_STOP(...) do {snprintf(internal_error_buff, 255, __VA_ARGS__); error("%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
13+
#define INTERNAL_STOP(...) do {snprintf(internal_error_buff, sizeof(internal_error_buff), __VA_ARGS__); error("%s %s: %s. %s", _("Internal error in"), __func__, internal_error_buff, _("Please report to the data.table issues tracker"));} while (0)
1414
#endif
1515

1616
typedef void writer_fun_t(const void *, int64_t, char **);

src/rbindlist.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
197197
if (!fill && (usenames==TRUE || usenames==NA_LOGICAL)) {
198198
// Ensure no missings in both cases, and (when usenames==NA) all columns in same order too
199199
// We proceeded earlier as if fill was true, so varying ncol items will have missing here
200-
char buff[1001] = "";
200+
char buff[1000] = "";
201201
const char *extra = usenames==TRUE?"":_(" use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for "
202202
" backwards compatibility. See news item 5 in v1.12.2 for options to control this message.");
203203
for (int i=0; i<LENGTH(l); ++i) {
@@ -212,7 +212,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
212212
SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
213213
int w2 = colMap[i*ncol + j];
214214
const char *str = isString(s) ? CHAR(STRING_ELT(s,w2)) : "";
215-
snprintf(buff, 1000, _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"),
215+
snprintf(buff, sizeof(buff), _("Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s"),
216216
w2+1, str, i+1, missi+1, extra );
217217
if (usenames==TRUE) error("%s", buff); // # notranslate
218218
i = LENGTH(l); // break from outer i loop
@@ -221,7 +221,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
221221
if (w!=j && usenames==NA_LOGICAL) {
222222
SEXP s = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
223223
if (!isString(s) || i==0) internal_error(__func__, "usenames==NA but an out-of-order name has been found in an item with no names or the first item. [%d]", i); // # nocov
224-
snprintf(buff, 1000, _("Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE to match by column name, or use.names=FALSE to ignore column names.%s"),
224+
snprintf(buff, sizeof(buff), _("Column %d ['%s'] of item %d appears in position %d in item %d. Set use.names=TRUE to match by column name, or use.names=FALSE to ignore column names.%s"),
225225
w+1, CHAR(STRING_ELT(s,w)), i+1, j+1, i, extra);
226226
i = LENGTH(l);
227227
break;
@@ -336,7 +336,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
336336
}
337337
}
338338

339-
if (!foundName) { static char buff[12]; snprintf(buff,12,"V%d",j+1), SET_STRING_ELT(ansNames, idcol+j, mkChar(buff)); foundName=buff; } // # notranslate
339+
if (!foundName) { static char buff[12]; snprintf(buff, sizeof(buff), "V%d", j + 1), SET_STRING_ELT(ansNames, idcol + j, mkChar(buff)); foundName = buff; } // # notranslate
340340
if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option)
341341
if (int64 && !(maxType==REALSXP || maxType==STRSXP || maxType==VECSXP || maxType==CPLXSXP))
342342
internal_error(__func__, "column %d of result is determined to be integer64 but maxType=='%s' != REALSXP", j+1, type2char(maxType)); // # nocov
@@ -402,12 +402,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
402402
const int tl = TRUELENGTH(s);
403403
if (tl>=last) { // if tl>=0 then also tl>=last because last<=0
404404
if (tl>=0) {
405-
snprintf(warnStr, 1000, // not direct warning as we're inside tl region
405+
snprintf(warnStr, sizeof(warnStr), // not direct warning as we're inside tl region
406406
_("Column %d of item %d is an ordered factor but level %d ['%s'] is missing from the ordered levels from column %d of item %d. " \
407407
"Each set of ordered factor levels should be an ordered subset of the first longest. A regular factor will be created for this column."),
408408
w+1, i+1, k+1, CHAR(s), longestW+1, longestI+1);
409409
} else {
410-
snprintf(warnStr, 1000,
410+
snprintf(warnStr, sizeof(warnStr),
411411
_("Column %d of item %d is an ordered factor with '%s'<'%s' in its levels. But '%s'<'%s' in the ordered levels from column %d of item %d. " \
412412
"A regular factor will be created for this column due to this ambiguity."),
413413
w+1, i+1, CHAR(levelsD[k-1]), CHAR(s), CHAR(s), CHAR(levelsD[k-1]), longestW+1, longestI+1);

0 commit comments

Comments
 (0)