diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 442891949dfd2..68027019e4c3f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h": SKIP_LINE FINISHED - enum: ERROR_OVERFLOW + enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS ctypedef enum BadLineHandleMethod: ERROR, @@ -1058,7 +1058,7 @@ cdef class TextReader: if col_dtype is not None: col_res, na_count = self._convert_with_dtype( col_dtype, i, start, end, na_filter, - 1, na_hashset, na_fset) + 1, na_hashset, na_fset, False) # Fallback on the parse (e.g. we requested int dtype, # but its actually a float). @@ -1069,22 +1069,26 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) else: col_res = None + maybe_int = True for dt in self.dtype_cast_order: - if (dt.kind in "iu" and - self._column_has_float(i, start, end, na_filter, na_hashset)): + if not maybe_int and dt.kind in "iu": continue try: col_res, na_count = self._convert_with_dtype( - dt, i, start, end, na_filter, 0, na_hashset, na_fset) - except ValueError: - # This error is raised from trying to convert to uint64, - # and we discover that we cannot convert to any numerical - # dtype successfully. As a result, we leave the data - # column AS IS with object dtype. - col_res, na_count = self._convert_with_dtype( - np.dtype("object"), i, start, end, 0, - 0, na_hashset, na_fset) + dt, i, start, end, na_filter, 0, na_hashset, na_fset, True) + except ValueError as e: + if str(e) == "Number is not int": + maybe_int = False + continue + else: + # This error is raised from trying to convert to uint64, + # and we discover that we cannot convert to any numerical + # dtype successfully. As a result, we leave the data + # column AS IS with object dtype. 
+ col_res, na_count = self._convert_with_dtype( + np.dtype("object"), i, start, end, 0, + 0, na_hashset, na_fset, False) except OverflowError: try: col_res, na_count = _try_pylong(self.parser, i, start, @@ -1092,7 +1096,7 @@ cdef class TextReader: except ValueError: col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, 0, - 0, na_hashset, na_fset) + 0, na_hashset, na_fset, False) if col_res is not None: break @@ -1140,7 +1144,7 @@ cdef class TextReader: bint na_filter, bint user_dtype, kh_str_starts_t *na_hashset, - set na_fset): + set na_fset, bint raise_on_float): if isinstance(dtype, CategoricalDtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype @@ -1181,14 +1185,14 @@ cdef class TextReader: elif dtype.kind in "iu": try: - result, na_count = _try_int64(self.parser, i, start, - end, na_filter, na_hashset) + result, na_count = _try_int64(self.parser, i, start, end, + na_filter, na_hashset, raise_on_float) if user_dtype and na_count is not None: if na_count > 0: raise ValueError(f"Integer column has NA values in column {i}") except OverflowError: result = _try_uint64(self.parser, i, start, end, - na_filter, na_hashset) + na_filter, na_hashset, raise_on_float) na_count = 0 if result is not None and dtype != "int64": @@ -1351,59 +1355,6 @@ cdef class TextReader: else: return None - cdef bint _column_has_float(self, Py_ssize_t col, - int64_t start, int64_t end, - bint na_filter, kh_str_starts_t *na_hashset): - """Check if the column contains any float number.""" - cdef: - Py_ssize_t i, j, lines = end - start - coliter_t it - const char *word = NULL - const char *ignored_chars = " +-" - const char *digits = "0123456789" - const char *float_indicating_chars = "eE" - char null_byte = 0 - - coliter_setup(&it, self.parser, col, start) - - for i in range(lines): - COLITER_NEXT(it, word) - - if na_filter and kh_get_str_starts_item(na_hashset, word): - continue - - found_first_digit 
= False - j = 0 - while word[j] != null_byte: - if word[j] == self.parser.decimal: - return True - elif not found_first_digit and word[j] in ignored_chars: - # no-op - pass - elif not found_first_digit and word[j] not in digits: - # word isn't numeric - return False - elif not found_first_digit and word[j] in digits: - found_first_digit = True - elif word[j] in float_indicating_chars: - # preceding chars indicates numeric and - # current char indicates float - return True - elif word[j] not in digits: - # previous characters indicates numeric - # current character shows otherwise - return False - elif word[j] in digits: - # no-op - pass - else: - raise AssertionError( - f"Unhandled case {word[j]=} {found_first_digit=}" - ) - j += 1 - - return False - # Factor out code common to TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ # which causes a class attribute lookup and violates best practices @@ -1800,7 +1751,8 @@ cdef int _try_double_nogil(parser_t *parser, cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset, + bint raise_on_float): cdef: int error Py_ssize_t lines @@ -1822,7 +1774,10 @@ cdef _try_uint64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") - return None + elif raise_on_float and error == ERROR_INVALID_CHARS: + raise ValueError("Number is not int") + elif not raise_on_float or error != ERROR_INVALID_CHARS: + return None if uint64_conflict(&state): raise ValueError("Cannot convert to numerical dtype") @@ -1872,7 +1827,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col, cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float): cdef: int 
error, na_count = 0 Py_ssize_t lines @@ -1892,7 +1847,10 @@ cdef _try_int64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") - return None, None + elif raise_on_float and error == ERROR_INVALID_CHARS: + raise ValueError("Number is not int") + elif not raise_on_float or error != ERROR_INVALID_CHARS: + return None, None return result, na_count diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 61e96fc835e4d..1561c16e4fd6c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1834,6 +1834,44 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } +/** + * @brief Validates that a string contains only numeric digits and optional + * trailing whitespace. + * + * This function is used after an integer overflow, + * where it checks the rest of the string for a non-numeric character, + * while also ignoring trailing whitespace. + * + * Pure integer overflows during CSV parsing are converted to PyLongObjects, + * while, if any invalid character is found, it skips integer + * parsing and tries other conversion methods. + * + * @param p_item Pointer to the string to validate for numeric format + * @param error Pointer to indicate error code. + * Set to ERROR_INVALID_CHARS if an invalid character is found. + * + * @return Pointer to the position in the string where validation stopped. + * - If valid: points to the null terminator. + * - If invalid: points to the first invalid character encountered. 
+ */ +static inline const char *check_for_invalid_char(const char *p_item, + int *error) { + while (*p_item != '\0' && isdigit_ascii(*p_item)) { + p_item++; + } + + while (*p_item != '\0' && isspace_ascii(*p_item)) { + ++p_item; + } + + // check if reached the end of string after consuming all digits + if (*p_item != '\0') { + *error = ERROR_INVALID_CHARS; + } + + return p_item; +} + int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { const char *p = p_item; @@ -1879,6 +1917,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, d = *++p; } else { *error = ERROR_OVERFLOW; + check_for_invalid_char(p, error); return 0; } } @@ -1890,6 +1929,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, d = *++p; } else { *error = ERROR_OVERFLOW; + check_for_invalid_char(p, error); return 0; } } @@ -1917,6 +1957,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } else { *error = ERROR_OVERFLOW; + check_for_invalid_char(p, error); return 0; } } @@ -1929,6 +1970,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } else { *error = ERROR_OVERFLOW; + check_for_invalid_char(p, error); return 0; } } @@ -1997,6 +2039,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { *error = ERROR_OVERFLOW; + check_for_invalid_char(p, error); return 0; } } @@ -2009,6 +2052,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { *error = ERROR_OVERFLOW; + check_for_invalid_char(p, error); return 0; } }