diff --git a/NEWS.md b/NEWS.md index 1ef9396f7..baea2d8b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -332,6 +332,8 @@ 19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix. +20. `fread()` honors now duplicate column names when using `select=` or `drop=`, so every matching column is kept or removed consistently, [#1899](https://github.com/Rdatatable/data.table/issues/1899). Thanks @MichaelChirico for the report and @ben-schwen for the fix. + ### NOTES 1. The following in-progress deprecations have proceeded: diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b2142520d..94a0a67ea 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21685,3 +21685,8 @@ d3 = unserialize(serialize(d2, NULL)) test(2340.05, .selfref.ok(d3), FALSE) setDT(d3) test(2340.06, .selfref.ok(d3), TRUE) + +# fread select= and drop= with duplicated column names #6729 +test(2341.1, fread("x1,x1,y\n2,3,4", select="x1"), data.table(x1=2L, x1=3L)) +test(2342.2, fread("x1,x1,y\n2,3,4", drop="x1"), data.table(y=4L)) +test(2342.3, fread("x1,x1,y\n2,3,4", drop=NA_character_), data.table(x1=2L, x1=3L, y=4L), warning="drop.*is NA") diff --git a/src/freadR.c b/src/freadR.c index 11bf02759..ba2cb2ff6 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -231,8 +231,35 @@ SEXP freadR( static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) { if (!length(items)) return; - const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP)); - const int* const itemsD = INTEGER(itemsInt); + if (isString(items)) { + const int nItems = LENGTH(items); + SEXP matches = PROTECT(chmatch(colNamesSxp, items, NA_INTEGER)); + const int *matchD = INTEGER(matches); + bool *found = (bool *)R_alloc((size_t)nItems, sizeof(*found)); // remembers which drop entries matched at least once + memset(found, 0, (size_t)nItems * sizeof(*found)); + for (int col = 0; col < ncol; col++) { + const int idx = matchD[col]; + if (idx == NA_INTEGER) continue; + type[col] = CT_DROP; + found[idx - 1] = true; + } + for (int j = 0; j < nItems; j++) { + char buff[50]; + if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate + else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate + SEXP thisItem = STRING_ELT(items, j); + if (thisItem == NA_STRING) { + DTWARN(_("%s is NA"), buff); + continue; + } + if (!found[j]) + DTWARN(_("Column name '%s' (%s) not found"), CHAR(thisItem), buff); + } + UNPROTECT(1); + return; + } + SEXP itemsInt = PROTECT(coerceVector(items, INTSXP)); + const int *itemsD = INTEGER(itemsInt); const int n = LENGTH(itemsInt); for (int j = 0; j < n; j++) { const int k = itemsD[j]; @@ -240,14 +267,8 @@ static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource) char buff[50]; if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate - if (k == NA_INTEGER) { - if (isString(items)) - DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff); - else - DTWARN(_("%s is NA"), buff); - } else { - DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol); - } + if (k == NA_INTEGER) DTWARN(_("%s is NA"), buff); + else DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol); } else { type[k - 1] = CT_DROP; // aside: dropping the same column several times is acceptable with no warning. This could arise via duplicates in the drop= vector, @@ -300,6 +321,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int } SET_VECTOR_ELT(RCHK, 3, selectRank = allocVector(INTSXP, ncol)); int *selectRankD = INTEGER(selectRank), rank = 1; + for (int i = 0; i < ncol; i++) selectRankD[i] = 0; for (int i = 0; i < n; i++) { const int k = selectInts[i]; if (k == NA_INTEGER) continue; // missing column name warned above and skipped @@ -310,6 +332,24 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int type[k - 1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all selectRankD[k - 1] = rank++; // rank not i to skip missing column names } + if (isString(selectSxp)) { + for (int i = 0; i < n; i++) { + const int k = selectInts[i]; + if (k == NA_INTEGER) continue; // skip NA matches + const SEXP targetS = STRING_ELT(selectSxp, i); + if (targetS == NA_STRING) continue; // skip empty names + const char *target = CHAR(targetS); + for (int col = 0; col < ncol; col++) { + if (selectRankD[col]) continue; // already marked by first match (including duplicates handled earlier) + SEXP nameS = STRING_ELT(colNamesSxp, col); + if (nameS == NA_STRING) continue; + if (strcmp(CHAR(nameS), target) == 0) { + type[col] *= -1; // flip type to flag + selectRankD[col] = rank++; + } + } + } + } for (int i = 0; i < ncol; i++) { if (type[i] < 0) type[i] *= -1; else type[i] = CT_DROP;