Skip to content

Commit 9c54bd0

Browse files
Add helper macros for the type bump system (#6983)
* type bump macros (picking a name) * more applications * back to TOGGLE_BUMP
1 parent 1593221 commit 9c54bd0

File tree

1 file changed

+21
-13
lines changed

1 file changed

+21
-13
lines changed

src/fread.c

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,14 @@ static void Field(FieldParseContext *ctx);
109109
#define AS_DIGIT(x) (uint_fast8_t)(x - '0')
110110
#define IS_DIGIT(x) (AS_DIGIT(x) < 10)
111111

112+
// Readability helpers for the ad-hoc type bumping system: flip e.g.
113+
// type '1' to '-1' to mark out-of-sample type bumps, i.e., cases
114+
// where the type inferred during auto-detection (which only uses
115+
// a sample of rows) winds up being incorrect when considering the
116+
// full input.
117+
#define IGNORE_BUMP(x) abs(x) // magnitude of a type code: drops the negative sign that marks an out-of-sample bump, so the result can index per-type tables
118+
#define TOGGLE_BUMP(x) (-(x)) // negate a type code to set/clear the bump mark; the argument is parenthesized so compound expressions (a-b, -t) are negated as a whole
119+
112120
//=================================================================================================
113121
//
114122
// Utility functions
@@ -220,11 +228,11 @@ static char *typesAsString(int ncol) {
220228
static char str[101];
221229
int i=0;
222230
if (ncol<=100) {
223-
for (; i<ncol; i++) str[i] = typeLetter[abs(type[i])]; // abs for out-of-sample type bumps (negative)
231+
for (; i<ncol; i++) str[i] = typeLetter[IGNORE_BUMP(type[i])];
224232
} else {
225-
for (; i<80; i++) str[i] = typeLetter[abs(type[i])];
233+
for (; i<80; i++) str[i] = typeLetter[IGNORE_BUMP(type[i])];
226234
str[i++]='.'; str[i++]='.'; str[i++]='.';
227-
for (int j=ncol-10; j<ncol; j++) str[i++] = typeLetter[abs(type[j])];
235+
for (int j=ncol-10; j<ncol; j++) str[i++] = typeLetter[IGNORE_BUMP(type[j])];
228236
}
229237
str[i] = '\0';
230238
return str;
@@ -2405,7 +2413,7 @@ int freadMain(freadMainArgs _args) {
24052413
// DTPRINT(_("Field %d: '%.10s' as type %d (tch=%p)\n"), j+1, tch, type[j], tch);
24062414
fieldStart = tch;
24072415
int8_t thisType = type[j]; // fetch shared type once. Cannot read half-written byte is one reason type's type is single byte to avoid atomic read here.
2408-
fun[abs(thisType)](&fctx);
2416+
fun[IGNORE_BUMP(thisType)](&fctx);
24092417
if (*tch!=sep) break;
24102418
int8_t thisSize = size[j];
24112419
if (thisSize) ((char **) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0
@@ -2455,7 +2463,7 @@ int freadMain(freadMainArgs _args) {
24552463
fieldStart = tch;
24562464
int8_t joldType = type[j];
24572465
int8_t thisType = joldType; // to know if it was bumped in (rare) out-of-sample type exceptions
2458-
int8_t absType = (int8_t)abs(thisType);
2466+
int8_t absType = (int8_t)IGNORE_BUMP(thisType);
24592467

24602468
while (absType < NUMTYPE) {
24612469
tch = fieldStart;
@@ -2468,7 +2476,7 @@ int freadMain(freadMainArgs _args) {
24682476
if (!end_of_field(tch)) tch = afterSpace; // else it is the field_end, we're on closing sep|eol and we'll let processor write appropriate NA as if field was empty
24692477
if (*tch==quote && quote) { quoted=true; tch++; }
24702478
} // else Field() handles NA inside it unlike other processors e.g. ,, is interpreted as "" or NA depending on option read inside Field()
2471-
fun[abs(thisType)](&fctx);
2479+
fun[IGNORE_BUMP(thisType)](&fctx);
24722480
if (quoted) { // quoted was only set to true with '&& quote' above (=> quote!='\0' now)
24732481
if (*tch==quote) tch++;
24742482
else goto typebump;
@@ -2487,7 +2495,7 @@ int freadMain(freadMainArgs _args) {
24872495
// sure a single re-read will definitely work.
24882496
typebump:
24892497
while (++absType<CT_STRING && disabled_parsers[absType]) {};
2490-
thisType = -absType;
2498+
thisType = TOGGLE_BUMP(absType);
24912499
tch = fieldStart;
24922500
}
24932501

@@ -2499,7 +2507,7 @@ int freadMain(freadMainArgs _args) {
24992507
if (j+fieldsRemaining != ncol) break;
25002508
checkedNumberOfFields = true;
25012509
}
2502-
if (thisType <= -NUMTYPE) {
2510+
if (thisType <= TOGGLE_BUMP(NUMTYPE)) {
25032511
break; // Improperly quoted char field needs to be healed below, other columns will be filled #5041 and #4774
25042512
}
25052513
#pragma omp critical
@@ -2512,7 +2520,7 @@ int freadMain(freadMainArgs _args) {
25122520
int len = snprintf(temp, 1000,
25132521
_("Column %d%s%.*s%s bumped from '%s' to '%s' due to <<%.*s>> on row %"PRIu64"\n"),
25142522
j+1, colNames?" <<":"", colNames?(colNames[j].len):0, colNames?(colNamesAnchor+colNames[j].off):"", colNames?">>":"",
2515-
typeName[abs(joldType)], typeName[abs(thisType)],
2523+
typeName[IGNORE_BUMP(joldType)], typeName[IGNORE_BUMP(thisType)],
25162524
(int)(tch-fieldStart), fieldStart, (uint64_t)(ctx.DTi+myNrow));
25172525
if (len > 1000) len = 1000;
25182526
if (len > 0) {
@@ -2675,10 +2683,10 @@ int freadMain(freadMainArgs _args) {
26752683
if (firstTime) {
26762684
tReread = tRead = wallclock();
26772685

2678-
// if nTypeBump>0, not-bumped columns are about to be assigned parse type -CT_STRING for the reread, so we have to count
2686+
// if nTypeBump>0, not-bumped columns are about to be assigned parse type TOGGLE_BUMP(CT_STRING) for the reread, so we have to count
26792687
// parse types now (for log). We can't count final column types afterwards because many parse types map to the same column type.
26802688
for (int i=0; i<NUMTYPE; i++) typeCounts[i] = 0;
2681-
for (int i=0; i<ncol; i++) typeCounts[ abs(type[i]) ]++;
2689+
for (int i=0; i<ncol; i++) typeCounts[ IGNORE_BUMP(type[i]) ]++;
26822690

26832691
if (nTypeBump) {
26842692
if (verbose) DTPRINT(_(" %d out-of-sample type bumps: %s\n"), nTypeBump, typesAsString(ncol));
@@ -2689,7 +2697,7 @@ int freadMain(freadMainArgs _args) {
26892697
if (type[j] == CT_DROP) continue;
26902698
if (type[j]<0) {
26912699
// column was bumped due to out-of-sample type exception
2692-
type[j] = -type[j];
2700+
type[j] = TOGGLE_BUMP(type[j]);
26932701
size[j] = typeSize[type[j]];
26942702
rowSize1 += (size[j] & 1);
26952703
rowSize4 += (size[j] & 4);
@@ -2698,7 +2706,7 @@ int freadMain(freadMainArgs _args) {
26982706
} else if (type[j]>=1) {
26992707
// we'll skip over non-bumped columns in the rerun, whilst still incrementing resi (hence not CT_DROP)
27002708
// not -type[i] either because that would reprocess the contents of not-bumped columns wastefully
2701-
type[j] = -CT_STRING;
2709+
type[j] = TOGGLE_BUMP(CT_STRING);
27022710
size[j] = 0;
27032711
}
27042712
}

0 commit comments

Comments
 (0)