Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 62 additions & 49 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,33 @@ static char quote, dec;
static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.'
static bool eol_one_r; // only true very rarely for \r-only files

// Quote rule:
// 0 = Fields may be quoted, any quote inside the field is doubled. This is
// the CSV standard. For example: <<...,"hello ""world""",...>>
// 1 = Fields may be quoted, any quotes inside are escaped with a backslash.
// For example: <<...,"hello \"world\"",...>>
// 2 = Fields may be quoted, but any quotes inside will appear verbatim and
// not escaped in any way. It is not always possible to parse the file
// unambiguously, but we give it a try anyways. A quote will be presumed
// to mark the end of the field iff it is followed by the field separator.
// Under this rule eol characters cannot appear inside the field.
// For example: <<...,"hello "world"",...>>
// 3 = Fields are not quoted at all. Any quote characters appearing anywhere
// inside the field will be treated as any other regular characters.
// Example: <<...,hello "world",...>>
//
static int quoteRule;
enum quote_rule_t
{
// Fields may be quoted, any quote inside the field is doubled.This is
// the CSV standard. For example: <<...,"hello ""world""",...>>
QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED,

// Fields may be quoted, any quotes inside are escaped with a backslash.
// For example: <<...,"hello \"world\"",...>>
QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED,

// Fields may be quoted, but any quotes inside will appear verbatim and
// not escaped in any way. It is not always possible to parse the file
// unambiguously, but we give it a try anyways. A quote will be presumed
// to mark the end of the field iff it is followed by the field separator.
// Under this rule eol characters cannot appear inside the field.
// For example: <<...,"hello "world"",...>>
QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED,

// Fields are not quoted at all. Any quote characters appearing anywhere
// inside the field will be treated as any other regular characters.
// Example: <<...,hello "world",...>>
QUOTE_RULE_IGNORE_QUOTES,

QUOTE_RULE_COUNT
};

static enum quote_rule_t quoteRule;
static const char* const* NAstrings;
static bool any_number_like_NAstrings = false;
static bool blank_is_a_NAstring = false;
Expand All @@ -75,7 +86,7 @@ static freadMainArgs args = { 0 }; // global for use by DTPRINT; static implies

// See header for more explanation.
const char typeName[NUMTYPE][10] = { "drop", "bool8", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string" };
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 };
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8, 8 };

// In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043
// So we assign them through below init_const_literals function.
Expand Down Expand Up @@ -200,7 +211,7 @@ static inline int iminInt( int a, int b) { return a < b ? a : b; }
/** Return value of `x` clamped to the range [upper, lower] */
static inline int64_t clamp_i64t(int64_t x, int64_t lower, int64_t upper)
{
return x < lower ? lower : x > upper? upper : x;
return x < lower ? lower : x > upper ? upper : x;
}


Expand Down Expand Up @@ -449,7 +460,7 @@ double copyFile(size_t fileSize) // only called in very very rare cases
return -1.0; // # nocov
memcpy(mmp_copy, mmp, fileSize);
sof = mmp_copy;
eof = (char *)OFFSET_POINTER(mmp_copy, fileSize);
eof = (char*)OFFSET_POINTER(mmp_copy, fileSize);
return wallclock() - tt;
}

Expand Down Expand Up @@ -504,7 +515,7 @@ static void Field(FieldParseContext *ctx)
if ((*ch == ' ' && stripWhite) || (*ch == '\0' && ch < eof))
while(*++ch == ' ' || (*ch == '\0' && ch < eof)); // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
const char *fieldStart = ch;
if (*ch != quote || quoteRule == 3 || quote == '\0') {
if (*ch != quote || quoteRule == QUOTE_RULE_IGNORE_QUOTES || quote == '\0') {
// Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that.
while(!end_of_field(ch)) ch++; // sep, \r, \n or eof will end
*ctx->ch = ch;
Expand All @@ -519,28 +530,28 @@ static void Field(FieldParseContext *ctx)
return;
}
// else *ch==quote (we don't mind that quoted fields are a little slower e.g. no desire to save switch)
// the field is quoted and quotes are correctly escaped (quoteRule 0 and 1)
// or the field is quoted but quotes are not escaped (quoteRule 2)
// or the field is not quoted but the data contains a quote at the start (quoteRule 2 too)
// the field is quoted and quotes are correctly escaped (QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED and QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED)
// or the field is quoted but quotes are not escaped (QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED)
// or the field is not quoted but the data contains a quote at the start (QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED too)
// What if this string signifies an NA? Will find out after we're done parsing quotes
const char *field_after_NA = end_NA_string(fieldStart);
fieldStart++; // step over opening quote
switch(quoteRule) {
case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
case QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
while (*++ch || ch < eof) {
if (*ch == quote) {
if (ch[1] == quote) { ch++; continue; }
break; // found undoubled closing quote
}
}
break;
case 1: // quoted with embedded quotes escaped; the final unescaped " must be followed by sep|eol
case QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED: // quoted with embedded quotes escaped; the final unescaped " must be followed by sep|eol
while (*++ch || ch < eof) {
if (*ch == '\\' && (ch[1] == quote || ch[1] == '\\')) { ch++; continue; }
if (*ch == quote) break;
}
break;
case 2:
case QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED:
// (i) quoted (perhaps because the source system knows sep is present) but any quotes were not escaped at all,
// so look for ", to define the end. (There might not be any quotes present to worry about, anyway).
// (ii) not-quoted but there is a quote at the beginning so it should have been; look for , at the end
Expand All @@ -550,7 +561,7 @@ static void Field(FieldParseContext *ctx)
{
const char *ch2 = ch;
while ((*++ch || ch < eof) && *ch != '\n' && *ch != '\r') {
if (*ch == quote && end_of_field(ch + 1)) {ch2 = ch; break;} // (*1) regular ", ending; leave *ch on closing quote
if (*ch == quote && end_of_field(ch + 1)) { ch2 = ch; break; } // (*1) regular ", ending; leave *ch on closing quote
if (*ch == sep) {
// first sep in this field
// if there is a ", afterwards but before the next \n, use that; the field was quoted and it's still case (i) above.
Expand Down Expand Up @@ -579,7 +590,7 @@ static void Field(FieldParseContext *ctx)
*ctx->ch = ch;
} else {
*ctx->ch = ch;
if (ch == eof && quoteRule != 2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
if (ch == eof && quoteRule != QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED
while(target->len > 0 && ((ch[-1] == ' ' && stripWhite) || ch[-1] == '\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST "
}
// Does end-of-field correspond to end-of-possible-NA?
Expand Down Expand Up @@ -1341,7 +1352,7 @@ int freadMain(freadMainArgs _args)
if (verbose) DTPRINT(_(" Using %d threads (omp_get_max_threads()=%d, nth=%d)\n"), nth, maxth, args.nth);
}

const uint64_t ui64 = NA_FLOAT64_I64;
static const uint64_t ui64 = NA_FLOAT64_I64;
memcpy(&NA_FLOAT64, &ui64, 8);

const int64_t nrowLimit = args.nrowLimit;
Expand Down Expand Up @@ -1378,7 +1389,9 @@ int freadMain(freadMainArgs _args)
}
disabled_parsers[CT_BOOL8_N] = !args.logical01;
disabled_parsers[CT_BOOL8_Y] = !args.logicalYN;
disabled_parsers[CT_ISO8601_DATE] = disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS
disabled_parsers[CT_ISO8601_DATE] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS
disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime;

if (verbose) {
if (*NAstrings == NULL) {
DTPRINT(_(" No NAstrings provided.\n"));
Expand Down Expand Up @@ -1438,7 +1451,7 @@ int freadMain(freadMainArgs _args)
}
else if (args.filename) {
if (verbose) DTPRINT(_(" Opening file %s\n"), args.filename);
const char* fnam = args.filename;
const char *fnam = args.filename;
#ifndef WIN32
int fd = open(fnam, O_RDONLY);
if (fd == -1) STOP(_("Couldn't open file %s: %s"), fnam, strerror(errno));
Expand Down Expand Up @@ -1497,7 +1510,7 @@ int freadMain(freadMainArgs _args)
CloseHandle(hFile); // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa366537(v=vs.85).aspx
if (mmp == NULL) {
#endif
int nbit = 8 * sizeof(char *); // #nocov
int nbit = 8 * sizeof(char*); // #nocov
STOP(_("Opened %s file ok but could not memory map it. This is a %dbit process. %s."), filesize_to_str(fileSize), nbit, // # nocov
nbit <= 32 ? _("Please upgrade to 64bit") : _("There is probably not enough contiguous virtual memory available")); // # nocov
}
Expand Down Expand Up @@ -1705,7 +1718,7 @@ int freadMain(freadMainArgs _args)
if (verbose) DTPRINT(_(" sep='\\n' passed in meaning read lines as single character column\n"));
sep = 127; // ASCII DEL: a character different from \r, \n and \0 that isn't in the data
whiteChar = 0;
quoteRule = 3; // Ignore quoting
quoteRule = QUOTE_RULE_IGNORE_QUOTES;
ncol = 1;
int thisLine = 0;
while (ch < eof && thisLine++ < jumpLines) {
Expand All @@ -1730,13 +1743,13 @@ int freadMain(freadMainArgs _args)
//topSep = args.sep;
if (verbose) DTPRINT(_(" Using supplied sep '%s'\n"), args.sep == '\t' ? "\\t" : seps);
}
int topNumLines = 0; // the most number of lines with the same number of fields, so far
int topNumFields = 1; // how many fields that was, to resolve ties
int topQuoteRule = -1; // which quote rule that was
int topSkip = 0; // how many rows to auto-skip
int topNumLines = 0; // the most number of lines with the same number of fields, so far
int topNumFields = 1; // how many fields that was, to resolve ties
enum quote_rule_t topQuoteRule = -1; // which quote rule that was
int topSkip = 0; // how many rows to auto-skip
const char *topStart = NULL;

for (quoteRule = quote ? 0 : 3; quoteRule < 4; quoteRule++) { // #loop_counter_not_local_scope_ok
for (quoteRule = quote ? QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED : QUOTE_RULE_IGNORE_QUOTES; quoteRule < QUOTE_RULE_COUNT; quoteRule++) { // #loop_counter_not_local_scope_ok
// quote rule in order of preference.
// when top is tied the first wins, so do all seps for the first quoteRule, then all seps for the second quoteRule, etc
for (int s = 0; s < nseps; s++) {
Expand All @@ -1746,7 +1759,7 @@ int freadMain(freadMainArgs _args)
// if (verbose) DTPRINT(_(" Trying sep='%c' with quoteRule %d ...\n"), sep, quoteRule);

if (fill) {
if (quoteRule > 1 && quote) continue; // turn off self-healing quote rule when filling
if (quoteRule > QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED && quote) continue; // turn off self-healing quote rule when filling
int firstRowNcol = countfields(&ch);
int thisncol = 0, maxncol = firstRowNcol, thisRow = 0;
while (ch < eof && ++thisRow < jumpLines) { // TODO: rename 'jumpLines' to 'jumpRows'
Expand Down Expand Up @@ -1801,7 +1814,7 @@ int freadMain(freadMainArgs _args)
if ((thisBlockLines > topNumLines && lastncol > 1) || // more lines wins even with fewer fields, so long as number of fields >= 2
(thisBlockLines == topNumLines &&
lastncol > topNumFields && // when number of lines is tied, choose the sep which separates it into more columns
(quoteRule < 2 || quoteRule <= topQuoteRule) && // for test 1834 where every line contains a correctly quoted field contain sep
(quoteRule < QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED || quoteRule <= topQuoteRule) && // for test 1834 where every line contains a correctly quoted field contain sep
(topNumFields <= 1 || sep != ' '))) {
topNumLines = thisBlockLines;
topNumFields = lastncol;
Expand All @@ -1828,8 +1841,8 @@ int freadMain(freadMainArgs _args)
ASSERT(topSep == 127, "Single column input has topSep=%d", topSep);
sep = topSep;
// no self healing quote rules, as we don't have >1 field to disambiguate
// choose quote rule 0 or 1 based on for which 100 rows gets furthest into file
for (quoteRule = 0; quoteRule <= 1; quoteRule++) { // #loop_counter_not_local_scope_ok
// choose QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED or QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED based on for which 100 rows gets furthest into file
for (quoteRule = QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED; quoteRule <= QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED; quoteRule++) { // #loop_counter_not_local_scope_ok
int thisRow = 0, thisncol = 0;
ch = pos;
while (ch < eof && ++thisRow < jumpLines && (thisncol = countfields(&ch)) >= 0) {};
Expand All @@ -1843,7 +1856,7 @@ int freadMain(freadMainArgs _args)
}

quoteRule = topQuoteRule;
if (quoteRule > 1 && quote) {
if (quoteRule > QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED && quote) {
DTWARN(_("Found and resolved improper quoting in first %d rows. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning."), jumpLines);
// TODO: include line number and text in warning. Could loop again with the standard quote rule to find the line that fails.
}
Expand Down Expand Up @@ -1890,8 +1903,8 @@ int freadMain(freadMainArgs _args)
DTPRINT(_(" File copy in RAM took %.3f seconds.\n"), time_taken);
else if (tt > 0.5) // # nocov
DTPRINT(_("Avoidable file copy in RAM took %.3f seconds. %s.\n"), time_taken, msg); // # nocov. not warning as that could feasibly cause CRAN tests to fail, say, if test machine is heavily loaded
pos = sof + (pos - (const char *)mmp);
firstJumpEnd = sof + (firstJumpEnd - (const char *)mmp);
pos = sof + (pos - (const char*)mmp);
firstJumpEnd = sof + (firstJumpEnd - (const char*)mmp);
} else {
if (verbose) DTPRINT(_(" 1-column file ends with 2 or more end-of-line. Restoring last eol using extra byte in cow page.\n"));
eof++;
Expand Down Expand Up @@ -2241,7 +2254,7 @@ int freadMain(freadMainArgs _args)
if (type[j] < tmpType[j]) {
if (strcmp(typeName[tmpType[j]], typeName[type[j]]) != 0) {
DTWARN(_("Attempt to override column %d%s%.*s%s of inherent type '%s' down to '%s' ignored. Only overrides to a higher type are currently supported. If this was intended, please coerce to the lower type afterwards."),
j + 1, colNames ? " <<" : "", colNames?(colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "", // #4644
j + 1, colNames ? " <<" : "", colNames ? (colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "", // #4644
typeName[tmpType[j]], typeName[type[j]]);
}
type[j] = tmpType[j];
Expand Down Expand Up @@ -2449,7 +2462,7 @@ int freadMain(freadMainArgs _args)
fun[IGNORE_BUMP(thisType)](&fctx);
if (*tch != sep) break;
int8_t thisSize = size[j];
if (thisSize) ((char **) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0
if (thisSize) ((char**) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0
tch++;
j++;
}
Expand All @@ -2463,7 +2476,7 @@ int freadMain(freadMainArgs _args)
}
else if (eol(&tch) && j < ncol) { // j<ncol needed for #2523 (erroneous extra comma after last field)
int8_t thisSize = size[j];
if (thisSize) ((char **) targets)[thisSize] += thisSize;
if (thisSize) ((char**) targets)[thisSize] += thisSize;
j++;
if (j > max_col) max_col = j;
if (j == ncol) { tch++; myNrow++; continue; } // next line. Back up to while (tch<nextJumpStart). Usually happens, fastest path
Expand Down Expand Up @@ -2633,7 +2646,7 @@ int freadMain(freadMainArgs _args)
ctx.nRows = myNrow;
orderBuffer(&ctx);
if (myStopEarly) {
if (quoteRule < 3) {
if (quoteRule < QUOTE_RULE_IGNORE_QUOTES) {
quoteRule++;
if (quoteRuleBumpedCh == NULL) {
// for warning message if the quote rule bump does in fact manage to heal it, e.g. test 1881
Expand Down
Loading