Skip to content

Commit 08aa94a

Browse files
committed
fread: consider quoted na.strings in text columns
Previously, Field() called end_NA_string() only for non-quoted fields, so it was impossible to set na.strings='""' and have empty quoted strings parsed as missing values. Fixes #6974.
1 parent 8fa0ffb commit 08aa94a

File tree

2 files changed

+19
-0
lines changed

2 files changed

+19
-0
lines changed

inst/tests/tests.Rraw

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21157,3 +21157,18 @@ test(2317.6, DT1[DF1, on='a', .(d = x.a + i.d)]$d, 5)
2115721157
test(2317.7, DT1[DF2, on='a', e := i.e]$e, 5)
2115821158
test(2317.8, DT1[DF2, on='a', e2 := x.a + i.e]$e2, 6)
2115921159
test(2317.9, DT1[DF2, on='a', .(e = x.a + i.e)]$e, 6)
21160+
21161+
# allow na.strings to be quoted, #6974
21162+
f = tempfile()
21163+
DT = data.table(
21164+
"Date Example" = c("12/5/2012", NA),
21165+
"Question 1" = c("Yes", NA),
21166+
"Question 2" = c("Yes", NA),
21167+
"Site: Country" = c("Chile", "Virgin Islands, British")
21168+
)
21169+
fwrite(DT, f, na = '""')
21170+
test(2318.1, fread(f, na.strings = '""'), DT)
21171+
unlink(f)
21172+
test(2318.2,
21173+
fread('"foo","bar","baz"\n"a","b","c"', na.strings = c('"foo"', '"bar"', '"baz"'), header = FALSE),
21174+
data.table(V1 = c(NA, "a"), V2 = c(NA, "b"), V3 = c(NA, "c")))

src/fread.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,8 @@ static void Field(FieldParseContext *ctx)
525525
// the field is quoted and quotes are correctly escaped (quoteRule 0 and 1)
526526
// or the field is quoted but quotes are not escaped (quoteRule 2)
527527
// or the field is not quoted but the data contains a quote at the start (quoteRule 2 too)
528+
// The field, opening quote included, may match one of the NA strings: remember where that match would end, then compare once the quotes have been parsed
529+
const char *field_after_NA = end_NA_string(fieldStart);
528530
fieldStart++; // step over opening quote
529531
switch(quoteRule) {
530532
case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
@@ -583,6 +585,8 @@ static void Field(FieldParseContext *ctx)
583585
if (ch==eof && quoteRule!=2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
584586
while(target->len>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST "
585587
}
588+
// If the parsed field ends exactly where the candidate NA string ended, flag the field as NA
589+
if (field_after_NA == ch) target->len = INT32_MIN;
586590
}
587591

588592
static void str_to_i32_core(const char **pch, int32_t *target, bool parse_date)

0 commit comments

Comments
 (0)