Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,8 @@

19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix.

20. `fread()` honors now duplicate column names when using `select=` or `drop=`, so every matching column is kept or removed consistently, [#1899](https://github.com/Rdatatable/data.table/issues/1899). Thanks @MichaelChirico for the report and @ben-schwen for the fix.

### NOTES

1. The following in-progress deprecations have proceeded:
Expand Down
5 changes: 5 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21685,3 +21685,8 @@ d3 = unserialize(serialize(d2, NULL))
test(2340.05, .selfref.ok(d3), FALSE)
setDT(d3)
test(2340.06, .selfref.ok(d3), TRUE)

# fread select= and drop= with duplicated column names #6729
test(2341.1, fread("x1,x1,y\n2,3,4", select="x1"), data.table(x1=2L, x1=3L))
test(2342.2, fread("x1,x1,y\n2,3,4", drop="x1"), data.table(y=4L))
test(2342.3, fread("x1,x1,y\n2,3,4", drop=NA_character_), data.table(x1=2L, x1=3L, y=4L), warning="drop.*is NA")
60 changes: 50 additions & 10 deletions src/freadR.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,23 +231,44 @@ SEXP freadR(
static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource)
{
if (!length(items)) return;
const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP));
const int* const itemsD = INTEGER(itemsInt);
if (isString(items)) {
const int nItems = LENGTH(items);
SEXP matches = PROTECT(chmatch(colNamesSxp, items, NA_INTEGER));
const int *matchD = INTEGER(matches);
bool *found = (bool *)R_alloc((size_t)nItems, sizeof(*found)); // remembers which drop entries matched at least once
memset(found, 0, (size_t)nItems * sizeof(*found));
for (int col = 0; col < ncol; col++) {
const int idx = matchD[col];
if (idx == NA_INTEGER) continue;
type[col] = CT_DROP;
found[idx - 1] = true;
}
for (int j = 0; j < nItems; j++) {
char buff[50];
if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate
else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate
SEXP thisItem = STRING_ELT(items, j);
if (thisItem == NA_STRING) {
DTWARN(_("%s is NA"), buff);
continue;
}
if (!found[j])
DTWARN(_("Column name '%s' (%s) not found"), CHAR(thisItem), buff);
}
UNPROTECT(1);
return;
}
SEXP itemsInt = PROTECT(coerceVector(items, INTSXP));
const int *itemsD = INTEGER(itemsInt);
const int n = LENGTH(itemsInt);
for (int j = 0; j < n; j++) {
const int k = itemsD[j];
if (k == NA_INTEGER || k < 1 || k > ncol) {
char buff[50];
if (dropSource == -1) snprintf(buff, sizeof(buff), "drop[%d]", j + 1); // # notranslate
else snprintf(buff, sizeof(buff), "colClasses[[%d]][%d]", dropSource + 1, j + 1); // # notranslate
if (k == NA_INTEGER) {
if (isString(items))
DTWARN(_("Column name '%s' (%s) not found"), CHAR(STRING_ELT(items, j)), buff);
else
DTWARN(_("%s is NA"), buff);
} else {
DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol);
}
if (k == NA_INTEGER) DTWARN(_("%s is NA"), buff);
else DTWARN(_("%s is %d which is out of range [1,ncol=%d]"), buff, k, ncol);
} else {
type[k - 1] = CT_DROP;
// aside: dropping the same column several times is acceptable with no warning. This could arise via duplicates in the drop= vector,
Expand Down Expand Up @@ -300,6 +321,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
}
SET_VECTOR_ELT(RCHK, 3, selectRank = allocVector(INTSXP, ncol));
int *selectRankD = INTEGER(selectRank), rank = 1;
for (int i = 0; i < ncol; i++) selectRankD[i] = 0;
for (int i = 0; i < n; i++) {
const int k = selectInts[i];
if (k == NA_INTEGER) continue; // missing column name warned above and skipped
Expand All @@ -310,6 +332,24 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
type[k - 1] *= -1; // detect and error on duplicates on all types without calling duplicated() at all
selectRankD[k - 1] = rank++; // rank not i to skip missing column names
}
if (isString(selectSxp)) {
for (int i = 0; i < n; i++) {
const int k = selectInts[i];
if (k == NA_INTEGER) continue; // skip NA matches
const SEXP targetS = STRING_ELT(selectSxp, i);
if (targetS == NA_STRING) continue; // skip empty names
const char *target = CHAR(targetS);
for (int col = 0; col < ncol; col++) {
if (selectRankD[col]) continue; // already marked by first match (including duplicates handled earlier)
SEXP nameS = STRING_ELT(colNamesSxp, col);
if (nameS == NA_STRING) continue;
if (strcmp(CHAR(nameS), target) == 0) {
type[col] *= -1; // flip type to flag
selectRankD[col] = rank++;
}
}
}
}
for (int i = 0; i < ncol; i++) {
if (type[i] < 0) type[i] *= -1;
else type[i] = CT_DROP;
Expand Down
Loading