Skip to content
108 changes: 33 additions & 75 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -1058,7 +1058,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but its actually a float).
Expand All @@ -1069,30 +1069,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is not int":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1140,7 +1144,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_float):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1181,14 +1185,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_float)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_float)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1351,59 +1355,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
int64_t start, int64_t end,
bint na_filter, kh_str_starts_t *na_hashset):
"""Check if the column contains any float number."""
cdef:
Py_ssize_t i, j, lines = end - start
coliter_t it
const char *word = NULL
const char *ignored_chars = " +-"
const char *digits = "0123456789"
const char *float_indicating_chars = "eE"
char null_byte = 0

coliter_setup(&it, self.parser, col, start)

for i in range(lines):
COLITER_NEXT(it, word)

if na_filter and kh_get_str_starts_item(na_hashset, word):
continue

found_first_digit = False
j = 0
while word[j] != null_byte:
if word[j] == self.parser.decimal:
return True
elif not found_first_digit and word[j] in ignored_chars:
# no-op
pass
elif not found_first_digit and word[j] not in digits:
# word isn't numeric
return False
elif not found_first_digit and word[j] in digits:
found_first_digit = True
elif word[j] in float_indicating_chars:
# preceding chars indicates numeric and
# current char indicates float
return True
elif word[j] not in digits:
# previous characters indicates numeric
# current character shows otherwise
return False
elif word[j] in digits:
# no-op
pass
else:
raise AssertionError(
f"Unhandled case {word[j]=} {found_first_digit=}"
)
j += 1

return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1800,7 +1751,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_float):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1822,7 +1774,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None

if uint64_conflict(&state):
raise ValueError("Cannot convert to numerical dtype")
Expand Down Expand Up @@ -1872,7 +1827,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
cdef:
int error, na_count = 0
Py_ssize_t lines
Expand All @@ -1892,7 +1847,10 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None, None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None, None

return result, na_count

Expand Down
44 changes: 44 additions & 0 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1834,6 +1834,44 @@ int uint64_conflict(uint_state *self) {
return self->seen_uint && (self->seen_sint || self->seen_null);
}

/**
* @brief Validates that a string contains only numeric digits and optional
* trailing whitespace.
*
* This function is used after an integer overflow,
* where is checks the rest of the string for a non-numeric character,
* while also ignoring trailing white-space.
*
* Pure integer overflows during CSV parsing are converted to PyLongObjects,
* while, if any invalid character is found, it skips integer
* parsing and tries other conversion methods.
*
* @param p_item Pointer to the string to validate for numeric format
* @param error Pointer to indicate error code.
* Set to ERROR_INVALID_CHARS if an invalid character is found.
*
* @return Pointer to the position in the string where validation stopped.
* - If valid: terminates at the null terminator.
* - If invalid: points to the first invalid character encountered.
*/
static inline const char *check_for_invalid_char(const char *p_item,
int *error) {
while (*p_item != '\0' && isdigit_ascii(*p_item)) {
p_item++;
}

while (*p_item != '\0' && isspace_ascii(*p_item)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be combined with the previous loop? Is there a reason for trailing whitespace to be handed specially, or is there a reason at all to allow whitespace?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It needs a separate loop because this case should be invalid "7890123 1351713789"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we allow trailing white space though?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is permitted below too if an overflow doesn't occur. I added it in this function to make it consistent.

++p_item;
}

// check if reached the end of string after consuming all digits
if (*p_item != '\0') {
*error = ERROR_INVALID_CHARS;
}

return p_item;
}

int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
const char *p = p_item;
Expand Down Expand Up @@ -1879,6 +1917,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
d = *++p;
} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand All @@ -1890,6 +1929,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
d = *++p;
} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand Down Expand Up @@ -1917,6 +1957,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand All @@ -1929,6 +1970,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand Down Expand Up @@ -1997,6 +2039,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand All @@ -2009,6 +2052,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand Down
Loading