Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 33 additions & 75 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -1058,7 +1058,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but its actually a float).
Expand All @@ -1069,30 +1069,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is not int":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1140,7 +1144,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_float):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1181,14 +1185,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_float)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_float)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1351,59 +1355,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
int64_t start, int64_t end,
bint na_filter, kh_str_starts_t *na_hashset):
"""Check if the column contains any float number."""
cdef:
Py_ssize_t i, j, lines = end - start
coliter_t it
const char *word = NULL
const char *ignored_chars = " +-"
const char *digits = "0123456789"
const char *float_indicating_chars = "eE"
char null_byte = 0

coliter_setup(&it, self.parser, col, start)

for i in range(lines):
COLITER_NEXT(it, word)

if na_filter and kh_get_str_starts_item(na_hashset, word):
continue

found_first_digit = False
j = 0
while word[j] != null_byte:
if word[j] == self.parser.decimal:
return True
elif not found_first_digit and word[j] in ignored_chars:
# no-op
pass
elif not found_first_digit and word[j] not in digits:
# word isn't numeric
return False
elif not found_first_digit and word[j] in digits:
found_first_digit = True
elif word[j] in float_indicating_chars:
# preceding chars indicates numeric and
# current char indicates float
return True
elif word[j] not in digits:
# previous characters indicates numeric
# current character shows otherwise
return False
elif word[j] in digits:
# no-op
pass
else:
raise AssertionError(
f"Unhandled case {word[j]=} {found_first_digit=}"
)
j += 1

return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1800,7 +1751,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_float):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1822,7 +1774,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None

if uint64_conflict(&state):
raise ValueError("Cannot convert to numerical dtype")
Expand Down Expand Up @@ -1872,7 +1827,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
cdef:
int error, na_count = 0
Py_ssize_t lines
Expand All @@ -1892,7 +1847,10 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None, None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None, None

return result, na_count

Expand Down
21 changes: 21 additions & 0 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1834,6 +1834,21 @@ int uint64_conflict(uint_state *self) {
return self->seen_uint && (self->seen_sint || self->seen_null);
}

static inline void check_for_invalid_char(const char *p_item, int *error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you document what this function does? The name check_for_invalid_char is a bit too vague - this is better described as something like cast_char_p_as_float no?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also suggest that you either return int and drop the int * argument, or return something useful (e.g., the parsed float value) and then set the pointer value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a Doxygen comment. It's also returning the pointer to the last verified character.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kept the error pointer in the function to prevent code duplication, and also because the main purpose of the function is just to assign a value to it. Considering that it's now returning the position of the last verified character, it's possible to change the error value outside the function, but I think it's cleaner the way it is.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is it returning a char *? It doesn't look like you are using the return value anywhere.

I'm leaning more towards the former approach unless there is a reason for it to return a value; it's really common practice to return an integral code to denote whether an error occurred (even in C++ you'll see Arrow do this all over the place). Tucking that return value away in a pointer is far less common.

It's also more performant to return the int value directly, although in this particular case that's probably too far down the stack to be noticeable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed it to return a status code.

while (*p_item != '\0' && isdigit_ascii(*p_item)) {
p_item++;
}

while (*p_item != '\0' && isspace_ascii(*p_item)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be combined with the previous loop? Is there a reason for trailing whitespace to be handled specially, or is there a reason at all to allow whitespace?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It needs a separate loop because this case should be invalid: "7890123 1351713789"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we allow trailing white space though?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is permitted below too if an overflow doesn't occur. I added it in this function to make it consistent.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we doing this at all though? Are we stripping trailing whitespace for any other case in the tokenizer?

Copy link
Contributor Author

@Alvaro-Kothe Alvaro-Kothe Oct 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we doing this at all though?

I couldn't find any particular reason in the code to do this.

Are we stripping trailing whitespace for any other case in the tokenizer?

Every function that parses a string in tokenizer.c ignores leading and trailing whitespace.

I changed the behavior to just check for the character after consuming all digits. The way that the function was used before didn't change the pointer position after calling it.

This function no longer permits trailing whitespace.

++p_item;
}

// check if reached the end of string after consuming all digits
if (*p_item != '\0') {
*error = ERROR_INVALID_CHARS;
}
}

int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
const char *p = p_item;
Expand Down Expand Up @@ -1879,6 +1894,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
d = *++p;
} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand All @@ -1890,6 +1906,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
d = *++p;
} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand Down Expand Up @@ -1917,6 +1934,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand All @@ -1929,6 +1947,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand Down Expand Up @@ -1997,6 +2016,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand All @@ -2009,6 +2029,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,

} else {
*error = ERROR_OVERFLOW;
check_for_invalid_char(p, error);
return 0;
}
}
Expand Down
Loading