Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 33 additions & 75 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -1058,7 +1058,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but its actually a float).
Expand All @@ -1069,30 +1069,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is not int":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1140,7 +1144,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_float):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1181,14 +1185,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_float)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_float)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1351,59 +1355,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
int64_t start, int64_t end,
bint na_filter, kh_str_starts_t *na_hashset):
"""Check if the column contains any float number."""
cdef:
Py_ssize_t i, j, lines = end - start
coliter_t it
const char *word = NULL
const char *ignored_chars = " +-"
const char *digits = "0123456789"
const char *float_indicating_chars = "eE"
char null_byte = 0

coliter_setup(&it, self.parser, col, start)

for i in range(lines):
COLITER_NEXT(it, word)

if na_filter and kh_get_str_starts_item(na_hashset, word):
continue

found_first_digit = False
j = 0
while word[j] != null_byte:
if word[j] == self.parser.decimal:
return True
elif not found_first_digit and word[j] in ignored_chars:
# no-op
pass
elif not found_first_digit and word[j] not in digits:
# word isn't numeric
return False
elif not found_first_digit and word[j] in digits:
found_first_digit = True
elif word[j] in float_indicating_chars:
# preceding chars indicates numeric and
# current char indicates float
return True
elif word[j] not in digits:
# previous characters indicates numeric
# current character shows otherwise
return False
elif word[j] in digits:
# no-op
pass
else:
raise AssertionError(
f"Unhandled case {word[j]=} {found_first_digit=}"
)
j += 1

return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1800,7 +1751,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_float):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1822,7 +1774,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None

if uint64_conflict(&state):
raise ValueError("Cannot convert to numerical dtype")
Expand Down Expand Up @@ -1872,7 +1827,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
cdef:
int error, na_count = 0
Py_ssize_t lines
Expand All @@ -1892,7 +1847,10 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None, None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None, None

return result, na_count

Expand Down
52 changes: 52 additions & 0 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1834,6 +1834,34 @@ int uint64_conflict(uint_state *self) {
return self->seen_uint && (self->seen_sint || self->seen_null);
}

/**
* @brief Validates that a string contains only numeric digits.
*
* This function is used after an integer overflow,
* where is checks the rest of the string for a non-numeric character.
*
* Pure integer overflows during CSV parsing are converted to PyLongObjects,
* while, if any invalid character is found, it skips integer
* parsing and tries other conversion methods.
*
* @param p_item Pointer to the string to validate for numeric format
*
* @return Integer 0 if the remainder of the string contains only digits,
* otherwise returns the error code for [ERROR_INVALID_CHARS].
*/
static inline int check_for_invalid_char(const char *p_item) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add the length of the string as an argument? I realize this is a static function, but its still best to guard against buffer overruns in case of future refactor

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This information is not available in any of the parent functions. So I would have to call strlen to use it. I don't see much value in it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its about minimizing the risk during refactor. C is not an inherently safe language, so you need to be somewhat paranoid when writing functions.

You are correct in that at face value calling strlen is pretty...well dumb. But its a sign that a refactor can happen in another PR to better keep track of the length of a string while processing it

while (*p_item != '\0' && isdigit_ascii(*p_item)) {
p_item++;
}

// check if reached the end of string after consuming all digits
if (*p_item != '\0') {
return ERROR_INVALID_CHARS;
}

return 0;
}

int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
const char *p = p_item;
Expand Down Expand Up @@ -1879,6 +1907,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
d = *++p;
} else {
*error = ERROR_OVERFLOW;
int status = check_for_invalid_char(p);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why tokenizer.h defines errors individually, but it would be much easier if you created a struct like:

enum TokenizerError {
    TOKENIZER_OK,
    ERROR_NO_DIGITS,
    ERROR_OVERFLOW,
    ERROR_INVALID_CHARS
};

Then you can just assign *error = check_for_invalid_char(p) and let the call stack naturally handle this (where 0 is no error).

Although its still a bit strange to apply an error on the preceding line then reassign it here. I wonder if there shouldn't be a generic function to check for errors here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also think the name ERROR_INVALID_CHARS is a little vague - maybe ERROR_INVALID_FLOAT_STRING is better?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why tokenizer.h defines errors individually, but it would be much easier if you created a struct

I even was thinking about doing it, but decided against it, thinking that this refactor would pollute this PR.

I also think the name ERROR_INVALID_CHARS is a little vague - maybe ERROR_INVALID_FLOAT_STRING is better?

This function checks for any invalid character, not specific to floating characters, so I think that ERROR_INVALID_CHARS is preferable.

I wonder if there shouldn't be a generic function to check for errors here

Like a function that overwrite the current error by some precedence?

For example: TOKENIZER_OK < ERROR_NO_DIGITS < ERROR_OVERFLOW < ERROR_INVALID_CHARS
If it's currently TOKENIZER_OK it would always be overwritten. ERROR_INVALID_CHARS would take precedence above all: would always substitute the current error and will never be overwritten?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although its still a bit strange to apply an error on the preceding line then reassign it here.

Another option would be an if-else statement. The overflow error should always be assigned if we don't find an invalid character.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I refactored the code for int64 and uint64 to use the enum. Additionally, I preferred to use if-else to assign the error code.

Another PR would be required to create and handle errors in other tokenizers.

if (status != 0) {
*error = status;
}
return 0;
}
}
Expand All @@ -1890,6 +1922,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
d = *++p;
} else {
*error = ERROR_OVERFLOW;
int status = check_for_invalid_char(p);
if (status != 0) {
*error = status;
}
return 0;
}
}
Expand Down Expand Up @@ -1917,6 +1953,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
int status = check_for_invalid_char(p);
if (status != 0) {
*error = status;
}
return 0;
}
}
Expand All @@ -1929,6 +1969,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
int status = check_for_invalid_char(p);
if (status != 0) {
*error = status;
}
return 0;
}
}
Expand Down Expand Up @@ -1997,6 +2041,10 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
int status = check_for_invalid_char(p);
if (status != 0) {
*error = status;
}
return 0;
}
}
Expand All @@ -2009,6 +2057,10 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
int status = check_for_invalid_char(p);
if (status != 0) {
*error = status;
}
return 0;
}
}
Expand Down
Loading