Skip to content

Commit c3cc4a1

Browse files
committed
refactor(parser): use integer parsing functions from stdlib
1 parent 1863adb commit c3cc4a1

File tree

3 files changed

+136
-153
lines changed

3 files changed

+136
-153
lines changed

pandas/_libs/include/pandas/parser/tokenizer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ See LICENSE for the license
1717
#define ERROR_NO_DIGITS 1
1818
#define ERROR_OVERFLOW 2
1919
#define ERROR_INVALID_CHARS 3
20+
#define ERROR_NO_MEMORY 4
2021

2122
#include <stdint.h>
2223

pandas/_libs/parsers.pyx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
149149
SKIP_LINE
150150
FINISHED
151151

152-
enum: ERROR_OVERFLOW
152+
enum: ERROR_OVERFLOW, ERROR_NO_MEMORY
153153

154154
ctypedef enum BadLineHandleMethod:
155155
ERROR,
@@ -1822,6 +1822,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
18221822
if error == ERROR_OVERFLOW:
18231823
# Can't get the word variable
18241824
raise OverflowError("Overflow")
1825+
if error == ERROR_NO_MEMORY:
1826+
raise MemoryError()
18251827
return None
18261828

18271829
if uint64_conflict(&state):
@@ -1892,6 +1894,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
18921894
if error == ERROR_OVERFLOW:
18931895
# Can't get the word variable
18941896
raise OverflowError("Overflow")
1897+
if error == ERROR_NO_MEMORY:
1898+
raise MemoryError()
18951899
return None, None
18961900

18971901
return result, na_count

pandas/_libs/src/parser/tokenizer.c

Lines changed: 130 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2323
#include <float.h>
2424
#include <math.h>
2525
#include <stdbool.h>
26+
#include <stdlib.h>
2627

2728
#include "pandas/portable.h"
2829
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
@@ -1834,201 +1835,178 @@ int uint64_conflict(uint_state *self) {
18341835
return self->seen_uint && (self->seen_sint || self->seen_null);
18351836
}
18361837

1838+
/**
1839+
* @brief Check whether the string begins with a digit, or with a sign followed by a digit.
1840+
* The caller is expected to have already skipped any leading whitespace.
1841+
*
1842+
* @param str Pointer to the string to check; NULL or an empty string yields 0.
1843+
* @return Non-zero if the string starts with a digit, or with '+' or '-' immediately followed by a digit; 0 otherwise.
1844+
*/
1845+
static inline int has_digit_int(const char *str) {
1846+
if (!str || *str == '\0') {
1847+
return 0;
1848+
}
1849+
1850+
switch (*str) {
1851+
case '0':
1852+
case '1':
1853+
case '2':
1854+
case '3':
1855+
case '4':
1856+
case '5':
1857+
case '6':
1858+
case '7':
1859+
case '8':
1860+
case '9':
1861+
return 1;
1862+
case '+':
1863+
case '-':
1864+
return isdigit_ascii(str[1]);
1865+
default:
1866+
return 0;
1867+
}
1868+
}
1869+
1870+
static inline int has_only_spaces(const char *str) {
1871+
while (*str != '\0' && isspace_ascii(*str)) {
1872+
str++;
1873+
}
1874+
return *str == '\0';
1875+
}
1876+
1877+
/* Return a newly allocated copy of `str` with every occurrence of `char_to_remove` removed.
1878+
* Returns NULL if the allocation fails; otherwise the caller must release the result with `free`.
1879+
*/
1880+
static char *copy_string_without_char(const char *str, char char_to_remove) {
1881+
size_t chars_to_copy = 0;
1882+
for (const char *src = str; *src != '\0'; src++) {
1883+
if (*src != char_to_remove) {
1884+
chars_to_copy++;
1885+
}
1886+
}
1887+
1888+
char *start = malloc((chars_to_copy + 1) * sizeof(char));
1889+
if (!start) {
1890+
return NULL;
1891+
}
1892+
1893+
char *dst = start;
1894+
for (const char *src = str; *src != '\0'; src++) {
1895+
if (*src != char_to_remove) {
1896+
*dst++ = *src;
1897+
}
1898+
}
1899+
*dst = '\0';
1900+
1901+
return start;
1902+
}
1903+
18371904
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18381905
int *error, char tsep) {
1839-
const char *p = p_item;
1840-
// Skip leading spaces.
1841-
while (isspace_ascii(*p)) {
1842-
++p;
1906+
if (!p_item || *p_item == '\0') {
1907+
*error = ERROR_NO_DIGITS;
1908+
return 0;
18431909
}
18441910

1845-
// Handle sign.
1846-
const bool isneg = *p == '-' ? true : false;
1847-
// Handle sign.
1848-
if (isneg || (*p == '+')) {
1849-
p++;
1911+
while (isspace_ascii(*p_item)) {
1912+
++p_item;
18501913
}
18511914

1852-
// Check that there is a first digit.
1853-
if (!isdigit_ascii(*p)) {
1854-
// Error...
1915+
if (!has_digit_int(p_item)) {
18551916
*error = ERROR_NO_DIGITS;
18561917
return 0;
18571918
}
18581919

1859-
int64_t number = 0;
1860-
if (isneg) {
1861-
// If number is greater than pre_min, at least one more digit
1862-
// can be processed without overflowing.
1863-
int dig_pre_min = -(int_min % 10);
1864-
int64_t pre_min = int_min / 10;
1865-
1866-
// Process the digits.
1867-
char d = *p;
1868-
if (tsep != '\0') {
1869-
while (1) {
1870-
if (d == tsep) {
1871-
d = *++p;
1872-
continue;
1873-
} else if (!isdigit_ascii(d)) {
1874-
break;
1875-
}
1876-
if ((number > pre_min) ||
1877-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1878-
number = number * 10 - (d - '0');
1879-
d = *++p;
1880-
} else {
1881-
*error = ERROR_OVERFLOW;
1882-
return 0;
1883-
}
1884-
}
1885-
} else {
1886-
while (isdigit_ascii(d)) {
1887-
if ((number > pre_min) ||
1888-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1889-
number = number * 10 - (d - '0');
1890-
d = *++p;
1891-
} else {
1892-
*error = ERROR_OVERFLOW;
1893-
return 0;
1894-
}
1895-
}
1896-
}
1897-
} else {
1898-
// If number is less than pre_max, at least one more digit
1899-
// can be processed without overflowing.
1900-
int64_t pre_max = int_max / 10;
1901-
int dig_pre_max = int_max % 10;
1902-
1903-
// Process the digits.
1904-
char d = *p;
1905-
if (tsep != '\0') {
1906-
while (1) {
1907-
if (d == tsep) {
1908-
d = *++p;
1909-
continue;
1910-
} else if (!isdigit_ascii(d)) {
1911-
break;
1912-
}
1913-
if ((number < pre_max) ||
1914-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1915-
number = number * 10 + (d - '0');
1916-
d = *++p;
1917-
1918-
} else {
1919-
*error = ERROR_OVERFLOW;
1920-
return 0;
1921-
}
1922-
}
1923-
} else {
1924-
while (isdigit_ascii(d)) {
1925-
if ((number < pre_max) ||
1926-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1927-
number = number * 10 + (d - '0');
1928-
d = *++p;
1920+
char *processed_str = NULL;
19291921

1930-
} else {
1931-
*error = ERROR_OVERFLOW;
1932-
return 0;
1933-
}
1934-
}
1922+
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
1923+
processed_str = copy_string_without_char(p_item, tsep);
1924+
if (!processed_str) {
1925+
*error = ERROR_NO_MEMORY;
1926+
return 0;
19351927
}
1928+
p_item = processed_str;
19361929
}
19371930

1938-
// Skip trailing spaces.
1939-
while (isspace_ascii(*p)) {
1940-
++p;
1941-
}
1931+
char *endptr = NULL;
1932+
errno = 0;
1933+
int64_t result = strtoll(p_item, &endptr, 10);
19421934

1943-
// Did we use up all the characters?
1944-
if (*p) {
1935+
if (!has_only_spaces(endptr)) {
1936+
// Check first for invalid characters because we may
1937+
// want to skip integer parsing if we find one.
19451938
*error = ERROR_INVALID_CHARS;
1946-
return 0;
1939+
result = 0;
1940+
} else if (errno == ERANGE || result > int_max || result < int_min) {
1941+
*error = ERROR_OVERFLOW;
1942+
result = 0;
1943+
} else {
1944+
*error = 0;
19471945
}
19481946

1949-
*error = 0;
1950-
return number;
1947+
// free processed_str that
1948+
// was either allocated due to the presence of tsep
1949+
// or is NULL
1950+
free(processed_str);
1951+
1952+
return result;
19511953
}
19521954

19531955
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19541956
uint64_t uint_max, int *error, char tsep) {
1955-
const char *p = p_item;
1956-
// Skip leading spaces.
1957-
while (isspace_ascii(*p)) {
1958-
++p;
1957+
if (!p_item || *p_item == '\0') {
1958+
*error = ERROR_NO_DIGITS;
1959+
return 0;
19591960
}
19601961

1961-
// Handle sign.
1962-
if (*p == '-') {
1962+
while (isspace_ascii(*p_item)) {
1963+
++p_item;
1964+
}
1965+
1966+
if (*p_item == '-') {
19631967
state->seen_sint = 1;
19641968
*error = 0;
19651969
return 0;
1966-
} else if (*p == '+') {
1967-
p++;
1970+
} else if (*p_item == '+') {
1971+
p_item++;
19681972
}
19691973

19701974
// Check that there is a first digit.
1971-
if (!isdigit_ascii(*p)) {
1972-
// Error...
1975+
if (!isdigit_ascii(*p_item)) {
19731976
*error = ERROR_NO_DIGITS;
19741977
return 0;
19751978
}
19761979

1977-
// If number is less than pre_max, at least one more digit
1978-
// can be processed without overflowing.
1979-
//
1980-
// Process the digits.
1981-
uint64_t number = 0;
1982-
const uint64_t pre_max = uint_max / 10;
1983-
const uint64_t dig_pre_max = uint_max % 10;
1984-
char d = *p;
1985-
if (tsep != '\0') {
1986-
while (1) {
1987-
if (d == tsep) {
1988-
d = *++p;
1989-
continue;
1990-
} else if (!isdigit_ascii(d)) {
1991-
break;
1992-
}
1993-
if ((number < pre_max) ||
1994-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
1995-
number = number * 10 + (d - '0');
1996-
d = *++p;
1980+
char *processed_str = NULL;
19971981

1998-
} else {
1999-
*error = ERROR_OVERFLOW;
2000-
return 0;
2001-
}
2002-
}
2003-
} else {
2004-
while (isdigit_ascii(d)) {
2005-
if ((number < pre_max) ||
2006-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
2007-
number = number * 10 + (d - '0');
2008-
d = *++p;
2009-
2010-
} else {
2011-
*error = ERROR_OVERFLOW;
2012-
return 0;
2013-
}
1982+
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
1983+
processed_str = copy_string_without_char(p_item, tsep);
1984+
if (!processed_str) {
1985+
*error = ERROR_NO_MEMORY;
1986+
return 0;
20141987
}
1988+
p_item = processed_str;
20151989
}
20161990

2017-
// Skip trailing spaces.
2018-
while (isspace_ascii(*p)) {
2019-
++p;
2020-
}
1991+
errno = 0;
1992+
char *endptr = NULL;
1993+
uint64_t result = strtoull(p_item, &endptr, 10);
20211994

2022-
// Did we use up all the characters?
2023-
if (*p) {
1995+
if (!has_only_spaces(endptr)) {
20241996
*error = ERROR_INVALID_CHARS;
2025-
return 0;
1997+
result = 0;
1998+
} else if (errno == ERANGE || result > uint_max) {
1999+
*error = ERROR_OVERFLOW;
2000+
result = 0;
2001+
} else {
2002+
*error = 0;
20262003
}
20272004

2028-
if (number > (uint64_t)int_max) {
2005+
if (result > (uint64_t)int_max) {
20292006
state->seen_uint = 1;
20302007
}
20312008

2032-
*error = 0;
2033-
return number;
2009+
free(processed_str);
2010+
2011+
return result;
20342012
}

0 commit comments

Comments
 (0)