Skip to content

Commit dda8722

Browse files
Merge remote-tracking branch 'origin/master' into merge_factor_char_key
2 parents 088f876 + 08d0b17 commit dda8722

File tree

7 files changed

+80
-43
lines changed

7 files changed

+80
-43
lines changed

NEWS.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@
7070
7171
14. `data.table()` function is now more aligned with `data.frame()` with respect to the names of the output when one of its inputs is a single-column matrix object, [#4124](https://github.com/Rdatatable/data.table/issues/4124). Thanks @PavoDive for the report and @jangorecki for the PR.
7272
73-
15. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
73+
15. Including an `ITime` object as a named input to `data.frame()` respects the provided name, i.e. `data.frame(a = as.ITime(...))` will have column `a`, [#4673](https://github.com/Rdatatable/data.table/issues/4673). Thanks @shrektan for the report and @MichaelChirico for the fix.
74+
75+
16. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
7476
7577
### NOTES
7678

R/IDateTime.R

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ as.character.ITime = format.ITime = function(x, ...) {
209209
res
210210
}
211211

212-
as.data.frame.ITime = function(x, ...) {
212+
as.data.frame.ITime = function(x, ..., optional=FALSE) {
213213
# This method is just for ggplot2, #1713
214214
# Avoids the error "cannot coerce class '"ITime"' into a data.frame", but for some reason
215215
# ggplot2 doesn't seem to call the print method to get axis labels, so still prints integers.
@@ -219,7 +219,8 @@ as.data.frame.ITime = function(x, ...) {
219219
# ans = list(as.POSIXct(x,tzone="")) # ggplot2 gives "Error: Discrete value supplied to continuous scale"
220220
setattr(ans, "class", "data.frame")
221221
setattr(ans, "row.names", .set_row_names(length(x)))
222-
setattr(ans, "names", "V1")
222+
# require 'optional' support for passing back to e.g. data.frame() without overriding names there
223+
if (!optional) setattr(ans, "names", "V1")
223224
ans
224225
}
225226

R/data.table.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ replace_dot_alias = function(e) {
221221
}
222222
return(x)
223223
}
224-
if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'")
224+
if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")
225225
missingroll = missing(roll)
226226
if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
227227
if (is.character(roll)) {

inst/tests/tests.Rraw

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21290,3 +21290,9 @@ test(2322.12, levels(fctr(c("b","a","c"), rev=NA)), error="TRUE or FALSE")
2129021290
test(2322.21, levels(fctr(c("b","a","c"), sort=TRUE)), c("a","b","c"))
2129121291
test(2322.22, levels(fctr(c("b","a","c"), sort=NA)), error="TRUE or FALSE")
2129221292
test(2322.31, levels(fctr(c("b","a","c"), rev=TRUE, sort=TRUE)), c("c","b","a"))
21293+
21294+
# data.frame() uses provided names of ITime inputs
21295+
it <- as.ITime('00:00:00')
21296+
test(2323.1, names(data.frame(COL = it)), "COL")
21297+
test(2323.2, names(data.frame(b = 1, COL = it)), c("b", "COL"))
21298+
test(2323.3, names(as.data.frame(it, optional=TRUE)), NULL)

src/bmerge.c

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,10 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
4949
// iArg, xArg, icolsArg and xcolsArg
5050
idtVec = SEXPPTR_RO(idt); // set globals so bmerge_r can see them.
5151
xdtVec = SEXPPTR_RO(xdt);
52-
if (!isInteger(icolsArg)) internal_error(__func__, "icols is not integer vector"); // # nocov
53-
if (!isInteger(xcolsArg)) internal_error(__func__, "xcols is not integer vector"); // # nocov
52+
if (!isInteger(icolsArg))
53+
internal_error(__func__, "icols is not integer vector"); // # nocov
54+
if (!isInteger(xcolsArg))
55+
internal_error(__func__, "xcols is not integer vector"); // # nocov
5456
if ((LENGTH(icolsArg)==0 || LENGTH(xcolsArg)==0) && LENGTH(idt)>0) // We let through LENGTH(i) == 0 for tests 2126.*
5557
internal_error(__func__, "icols and xcols must be non-empty integer vectors");
5658
if (LENGTH(icolsArg) > LENGTH(xcolsArg)) internal_error(__func__, "length(icols) [%d] > length(xcols) [%d]", LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov
@@ -60,10 +62,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
6062
iN = ilen = anslen = LENGTH(idt) ? LENGTH(VECTOR_ELT(idt,0)) : 0;
6163
ncol = LENGTH(icolsArg); // there may be more sorted columns in x than involved in the join
6264
for(int col=0; col<ncol; col++) {
63-
if (icols[col]==NA_INTEGER) internal_error(__func__, "icols[%d] is NA", col); // # nocov
64-
if (xcols[col]==NA_INTEGER) internal_error(__func__, "xcols[%d] is NA", col); // # nocov
65-
if (icols[col]>LENGTH(idt) || icols[col]<1) error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(idt));
66-
if (xcols[col]>LENGTH(xdt) || xcols[col]<1) error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(xdt));
65+
if (icols[col]==NA_INTEGER)
66+
internal_error(__func__, "icols[%d] is NA", col); // # nocov
67+
if (xcols[col]==NA_INTEGER)
68+
internal_error(__func__, "xcols[%d] is NA", col); // # nocov
69+
if (icols[col]>LENGTH(idt) || icols[col]<1)
70+
internal_error(__func__, "icols[%d]=%d outside range [1,length(i)=%d]", col, icols[col], LENGTH(idt)); // # nocov. Should have been caught already.
71+
if (xcols[col]>LENGTH(xdt) || xcols[col]<1)
72+
internal_error(__func__, "xcols[%d]=%d outside range [1,length(x)=%d]", col, xcols[col], LENGTH(xdt)); // # nocov
6773
int it = TYPEOF(VECTOR_ELT(idt, icols[col]-1));
6874
int xt = TYPEOF(VECTOR_ELT(xdt, xcols[col]-1));
6975
if (iN && it!=xt)
@@ -75,11 +81,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
7581
// rollArg, rollendsArg
7682
roll = 0.0; rollToNearest = FALSE;
7783
if (isString(rollarg)) {
78-
if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'"));
79-
if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet."));
84+
if (strcmp(CHAR(STRING_ELT(rollarg, 0)), "nearest") != 0)
85+
internal_error(__func__, "roll is character but not 'nearest'"); // # nocov. Only [.data.table exposes roll= directly, and this is already checked there.
86+
if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP)
87+
error(_("roll='nearest' can't be applied to a character column, yet."));
8088
roll=1.0; rollToNearest=TRUE; // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later
8189
} else {
82-
if (!isReal(rollarg)) internal_error(__func__, "roll is not character or double"); // # nocov
90+
if (!isReal(rollarg))
91+
internal_error(__func__, "roll is not character or double"); // # nocov
8392
roll = REAL(rollarg)[0]; // more common case (rolling forwards or backwards) or no roll when 0.0
8493
}
8594
rollabs = fabs(roll);
@@ -98,10 +107,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
98107
}
99108

100109
// mult arg
101-
if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL;
102-
else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST;
103-
else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST;
104-
else internal_error(__func__, "invalid value for 'mult'"); // # nocov
110+
if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all"))
111+
mult = ALL;
112+
else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first"))
113+
mult = FIRST;
114+
else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last"))
115+
mult = LAST;
116+
else
117+
internal_error(__func__, "invalid value for 'mult'"); // # nocov
105118

106119
// opArg
107120
if (!isInteger(opArg) || length(opArg)!=ncol)
@@ -132,7 +145,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
132145
retLength = R_Calloc(anslen, int);
133146
retIndex = R_Calloc(anslen, int);
134147
// initialise retIndex here directly, as next loop is meant for both equi and non-equi joins
135-
for (int j=0; j<anslen; j++) retIndex[j] = j+1;
148+
for (int j=0; j<anslen; j++)
149+
retIndex[j] = j+1;
136150
} else { // equi joins (or) non-equi join but no multiple matches
137151
retFirstArg = PROTECT(allocVector(INTSXP, anslen));
138152
retFirst = INTEGER(retFirstArg);
@@ -145,9 +159,11 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
145159
for (int j=0; j<anslen; j++) {
146160
// defaults need to populated here as bmerge_r may well not touch many locations, say if the last row of i is before the first row of x.
147161
retFirst[j] = nomatch; // default to no match for NA goto below
148-
// retLength[j] = 0; // TO DO: do this to save the branch below and later branches at R level to set .N to 0
149-
retLength[j] = nomatch==0 ? 0 : 1;
150162
}
163+
// retLength[j] = 0; // TO DO: do this to save the branch below and later branches at R level to set .N to 0
164+
int retLengthVal = (int)(nomatch != 0);
165+
for (int j=0; j<anslen; j++)
166+
retLength[j] = retLengthVal;
151167

152168
// allLen1Arg
153169
allLen1Arg = PROTECT(allocVector(LGLSXP, 1));
@@ -174,7 +190,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
174190
// xo arg
175191
xo = NULL;
176192
if (length(xoArg)) {
177-
if (!isInteger(xoArg)) internal_error(__func__, "xoArg is not an integer vector"); // # nocov
193+
if (!isInteger(xoArg))
194+
internal_error(__func__, "xoArg is not an integer vector"); // # nocov
178195
xo = INTEGER(xoArg);
179196
}
180197

@@ -391,10 +408,13 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg
391408
// final two 1's are lowmax and uppmax
392409
} else {
393410
int len = xupp-xlow-1+rollLow+rollUpp; // rollLow and rollUpp cannot both be true
394-
if (mult==ALL && len>1) allLen1[0] = FALSE;
411+
if (len>1) {
412+
if (mult==ALL)
413+
allLen1[0] = FALSE; // bmerge()$allLen1
414+
}
395415
if (nqmaxgrp == 1) {
396-
const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // extra +1 for 1-based indexing at R level
397-
const int rl = (mult==ALL) ? len : 1;
416+
const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // bmerge()$starts thus extra +1 for 1-based indexing at R level
417+
const int rl = (mult==ALL) ? len : 1; // bmerge()$lens
398418
for (int j=ilow+1; j<iupp; j++) { // usually iterates once only for j=ir
399419
const int k = o ? o[j]-1 : j;
400420
retFirst[k] = rf;

src/fread.c

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ bool freadCleanup(void)
193193
static inline uint64_t umax(uint64_t a, uint64_t b) { return a > b ? a : b; }
194194
static inline uint64_t umin(uint64_t a, uint64_t b) { return a < b ? a : b; }
195195
static inline int64_t imin( int64_t a, int64_t b) { return a < b ? a : b; }
196+
static inline int iminInt( int a, int b) { return a < b ? a : b; }
196197

197198
/** Return value of `x` clamped to the range [upper, lower] */
198199
static inline int64_t clamp_i64t(int64_t x, int64_t lower, int64_t upper) {
@@ -2238,9 +2239,9 @@ int freadMain(freadMainArgs _args) {
22382239
double thRead = 0, thPush = 0; // reductions of timings within the parallel region
22392240
int max_col = 0;
22402241
char *typeBumpMsg = NULL; size_t typeBumpMsgSize = 0;
2241-
int typeCounts[NUMTYPE]; // used for verbose output; needs populating after first read and before reread (if any) -- see later comment
2242-
#define internalErrSize 1000
2243-
char internalErr[internalErrSize+1]=""; // must be compile time size: the message is generated and we can't free before STOP
2242+
int typeCounts[NUMTYPE]; // used for verbose output; needs populating after first read and before reread (if any) -- see later comment
2243+
char internalErr[256] = ""; // must be compile time size: the message is generated and we can't free before STOP
2244+
22442245
int64_t DTi = 0; // the current row number in DT that we are writing to
22452246
const char *headPos = pos; // the jump start corresponding to DTi
22462247
int nSwept = 0; // count the number of dirty jumps that were swept
@@ -2296,7 +2297,7 @@ int freadMain(freadMainArgs _args) {
22962297
nth = omp_get_num_threads();
22972298
if (me != 0) {
22982299
// # nocov start
2299-
snprintf(internalErr, internalErrSize, "Master thread is not thread 0 but thread %d.\n", me); // # notranslate
2300+
snprintf(internalErr, sizeof(internalErr), "Master thread is not thread 0 but thread %d.\n", me); // # notranslate
23002301
stopTeam = true;
23012302
// # nocov end
23022303
}
@@ -2518,18 +2519,19 @@ int freadMain(freadMainArgs _args) {
25182519
// Can't print because we're likely not master. So accumulate message and print afterwards.
25192520
if (thisType < joldType) { // thisType<0 (type-exception)
25202521
if (verbose) {
2521-
char temp[1001];
2522-
int len = snprintf(temp, 1000,
2522+
char buffer[256];
2523+
int len = snprintf(buffer, sizeof(buffer),
25232524
_("Column %d%s%.*s%s bumped from '%s' to '%s' due to <<%.*s>> on row %"PRId64"\n"),
25242525
j + 1, colNames ? " <<" : "", colNames ? (colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "",
25252526
typeName[IGNORE_BUMP(joldType)], typeName[IGNORE_BUMP(thisType)],
25262527
(int)(tch - fieldStart), fieldStart, (int64_t)(ctx.DTi + myNrow));
2527-
if (len > 1000) len = 1000;
2528-
if (len > 0) {
2529-
typeBumpMsg = realloc(typeBumpMsg, typeBumpMsgSize + len + 1);
2530-
strcpy(typeBumpMsg + typeBumpMsgSize, temp);
2531-
typeBumpMsgSize += len;
2532-
}
2528+
2529+
len = iminInt(len, sizeof(buffer));
2530+
2531+
typeBumpMsg = realloc(typeBumpMsg, typeBumpMsgSize + len + 1);
2532+
strcpy(typeBumpMsg + typeBumpMsgSize, buffer);
2533+
typeBumpMsgSize += len;
2534+
25332535
}
25342536
nTypeBump++;
25352537
if (joldType > 0) nTypeBumpCols++;
@@ -2570,7 +2572,7 @@ int freadMain(freadMainArgs _args) {
25702572
}
25712573
else if (headPos != thisJumpStart && nrowLimit > 0) { // do not care for dirty jumps since we do not read data and only want to know types
25722574
// # nocov start
2573-
snprintf(internalErr, internalErrSize, "invalid head position. jump=%d, headPos=%p, thisJumpStart=%p, sof=%p", jump, headPos, thisJumpStart, sof); // # notranslate
2575+
snprintf(internalErr, sizeof(internalErr), "invalid head position. jump=%d, headPos=%p, thisJumpStart=%p, sof=%p", jump, headPos, thisJumpStart, sof); // # notranslate
25742576
stopTeam = true;
25752577
// # nocov end
25762578
}

src/vecseq.c

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@ SEXP vecseq(SEXP x, SEXP len, SEXP clamp)
1010
// Specially for use by [.data.table after binary search. Now so specialized that for general use
1111
// bit::vecseq is recommended (Jens has coded it in C now).
1212

13-
if (!isInteger(x)) error(_("x must be an integer vector"));
14-
if (!isInteger(len)) error(_("len must be an integer vector"));
15-
if (LENGTH(x) != LENGTH(len)) error(_("x and len must be the same length"));
13+
if (!isInteger(x))
14+
error(_("x must be an integer vector")); // # nocov
15+
if (!isInteger(len))
16+
error(_("len must be an integer vector")); // # nocov
17+
if (LENGTH(x) != LENGTH(len))
18+
error(_("x and len must be the same length")); // # nocov
1619
const int *ix = INTEGER(x);
1720
const int *ilen = INTEGER(len), nlen=LENGTH(len);
1821
int reslen = 0;
@@ -22,10 +25,13 @@ SEXP vecseq(SEXP x, SEXP len, SEXP clamp)
2225
reslen += ilen[i];
2326
}
2427
if (!isNull(clamp)) {
25-
if (!isNumeric(clamp) || LENGTH(clamp)!=1) error(_("clamp must be a double vector length 1"));
28+
if (!isNumeric(clamp) || LENGTH(clamp)!=1)
29+
error(_("clamp must be a double vector length 1")); // # nocov
2630
double limit = REAL(clamp)[0];
27-
if (limit<0) error(_("clamp must be positive"));
28-
if (reslen>limit) error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit);
31+
if (limit<0)
32+
error(_("clamp must be positive")); // # nocov
33+
if (reslen>limit)
34+
error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit);
2935
}
3036
SEXP ans = PROTECT(allocVector(INTSXP, reslen));
3137
int *ians = INTEGER(ans);

0 commit comments

Comments
 (0)