Skip to content

Commit 2102755

Browse files
committed
fread can now select= and drop= with duplicated column names
1 parent 55b0de6 commit 2102755

File tree

3 files changed

+56
-10
lines changed

3 files changed

+56
-10
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,8 @@
332332
333333
19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix.
334334
335+
20. `fread()` honors now duplicate column names when using `select=` or `drop=`, so every matching column is kept or removed consistently, [#1899](https://github.com/Rdatatable/data.table/issues/1899). Thanks @MichaelChirico for the report and @ben-schwen for the fix.
336+
335337
### NOTES
336338
337339
1. The following in-progress deprecations have proceeded:

inst/tests/tests.Rraw

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21685,3 +21685,7 @@ d3 = unserialize(serialize(d2, NULL))
2168521685
test(2340.05, .selfref.ok(d3), FALSE)
2168621686
setDT(d3)
2168721687
test(2340.06, .selfref.ok(d3), TRUE)
21688+
21689+
# fread select= and drop= with duplicated column names #6729
21690+
test(2341.1, fread("x1,x1,y\n2,3,4", select="x1"), data.table(x1=2L, x1=3L))
21691+
test(2342.2, fread("x1,x1,y\n2,3,4", drop="x1"), data.table(y=4L))

src/freadR.c

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -231,23 +231,44 @@ SEXP freadR(
231231
static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource)
232232
{
233233
if (!length(items)) return;
234-
const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP));
235-
const int* const itemsD = INTEGER(itemsInt);
234+
if (isString(items)) {
235+
const int nItems = LENGTH(items);
236+
SEXP matches = PROTECT(chmatch(colNamesSxp, items, NA_INTEGER));
237+
const int *matchD = INTEGER(matches);
238+
bool *found = (bool *)R_alloc((size_t)nItems, sizeof(*found)); // remembers which drop entries matched at least once
239+
memset(found, 0, (size_t)nItems * sizeof(*found));
240+
for (int col = 0; col < ncol; col++) {
241+
const int idx = matchD[col];
242+
if (idx == NA_INTEGER) continue;
243+
type[col] = CT_DROP;
244+
found[idx - 1] = true;
245+
}
246+
for (int j = 0; j < nItems; j++) {
247+
char buff[50];
248+
if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate
249+
else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate
250+
SEXP thisItem = STRING_ELT(items, j);
251+
if (thisItem == NA_STRING) {
252+
DTWARN(_("%s is NA"), buff);
253+
continue;
254+
}
255+
if (!found[j])
256+
DTWARN(_("Column name '%s' (%s) not found"), CHAR(thisItem), buff);
257+
}
258+
UNPROTECT(1);
259+
return;
260+
}
261+
SEXP itemsInt = PROTECT(coerceVector(items, INTSXP));
262+
const int *itemsD = INTEGER(itemsInt);
236263
const int n = LENGTH(itemsInt);
237264
for (int j = 0; j < n; j++) {
238265
const int k = itemsD[j];
239266
if (k == NA_INTEGER || k < 1 || k > ncol) {
240267
char buff[50];
241268
if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate
242269
else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate
243-
if (k == NA_INTEGER) {
244-
if (isString(items))
245-
DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff);
246-
else
247-
DTWARN(_("%s is NA"), buff);
248-
} else {
249-
DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol);
250-
}
270+
if (k == NA_INTEGER) DTWARN(_("%s is NA"), buff);
271+
else DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol);
251272
} else {
252273
type[k - 1] = CT_DROP;
253274
// aside: dropping the same column several times is acceptable with no warning. This could arise via duplicates in the drop= vector,
@@ -300,6 +321,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
300321
}
301322
SET_VECTOR_ELT(RCHK, 3, selectRank = allocVector(INTSXP, ncol));
302323
int *selectRankD = INTEGER(selectRank), rank = 1;
324+
for (int i = 0; i < ncol; i++) selectRankD[i] = 0;
303325
for (int i = 0; i < n; i++) {
304326
const int k = selectInts[i];
305327
if (k == NA_INTEGER) continue; // missing column name warned above and skipped
@@ -310,6 +332,24 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
310332
type[k - 1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all
311333
selectRankD[k - 1] = rank++; // rank not i to skip missing column names
312334
}
335+
if (isString(selectSxp)) {
336+
for (int i = 0; i < n; i++) {
337+
const int k = selectInts[i];
338+
if (k == NA_INTEGER) continue; // skip NA matches
339+
const SEXP targetS = STRING_ELT(selectSxp, i);
340+
if (targetS == NA_STRING) continue; // skip empty names
341+
const char *target = CHAR(targetS);
342+
for (int col = 0; col < ncol; col++) {
343+
if (selectRankD[col]) continue; // already marked by first match (including duplicates handled earlier)
344+
SEXP nameS = STRING_ELT(colNamesSxp, col);
345+
if (nameS == NA_STRING) continue;
346+
if (strcmp(CHAR(nameS), target) == 0) {
347+
type[col] *= -1; // flip type to flag
348+
selectRankD[col] = rank++;
349+
}
350+
}
351+
}
352+
}
313353
for (int i = 0; i < ncol; i++) {
314354
if (type[i] < 0) type[i] *= -1;
315355
else type[i] = CT_DROP;

0 commit comments

Comments
 (0)