@@ -23,6 +23,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2323#include <float.h>
2424#include <math.h>
2525#include <stdbool.h>
26+ #include <stdlib.h>
2627
2728#include "pandas/portable.h"
2829#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
@@ -1834,201 +1835,178 @@ int uint64_conflict(uint_state *self) {
18341835 return self -> seen_uint && (self -> seen_sint || self -> seen_null );
18351836}
18361837
/**
 * @brief Check whether a string starts with a decimal integer literal.
 *
 * At most one leading '+' or '-' is accepted, and it must be followed
 * immediately by a decimal digit. Leading whitespace is NOT skipped; the
 * caller is expected to have consumed it already.
 *
 * @param str NUL-terminated string to inspect; may be NULL.
 * @return 1 if `str` begins with an optional sign followed by a digit,
 *         0 otherwise (including for NULL, empty and sign-only strings).
 */
static inline int has_digit_int(const char *str) {
  if (!str) {
    return 0;
  }

  // Step over a single sign. Inputs such as "+-1" or "--1" are rejected
  // because the character after the first sign is not a digit.
  if (*str == '+' || *str == '-') {
    str++;
  }

  // C guarantees that '0'..'9' are contiguous, so a range test suffices.
  // An empty string or a lone sign lands on '\0' here and yields 0.
  return *str >= '0' && *str <= '9';
}
1869+
/* Report whether `str` contains nothing but characters accepted by
 * `isspace_ascii` before its terminating NUL.
 *
 * Returns 1 for an empty or all-whitespace string, and 0 as soon as any
 * other character is encountered. `str` must not be NULL.
 */
static inline int has_only_spaces(const char *str) {
  for (const char *cursor = str; *cursor != '\0'; cursor++) {
    if (!isspace_ascii(*cursor)) {
      return 0;
    }
  }
  return 1;
}
1876+
/**
 * Duplicate `str`, omitting every occurrence of `char_to_remove`.
 *
 * @param str            NUL-terminated source string; NULL is tolerated.
 * @param char_to_remove Character to leave out of the copy.
 * @return Newly allocated NUL-terminated string that the caller must release
 *         with `free`, or NULL if `str` is NULL or the allocation fails.
 */
static char *copy_string_without_char(const char *str, char char_to_remove) {
  if (!str) {
    return NULL;
  }

  // First pass: count the characters that survive so the buffer is exact.
  size_t chars_to_copy = 0;
  for (const char *src = str; *src != '\0'; src++) {
    if (*src != char_to_remove) {
      chars_to_copy++;
    }
  }

  // +1 for the terminating NUL.
  char *start = malloc(chars_to_copy + 1);
  if (!start) {
    return NULL;
  }

  // Second pass: copy everything except the filtered character.
  char *dst = start;
  for (const char *src = str; *src != '\0'; src++) {
    if (*src != char_to_remove) {
      *dst++ = *src;
    }
  }
  *dst = '\0';

  return start;
}
1903+
18371904int64_t str_to_int64 (const char * p_item , int64_t int_min , int64_t int_max ,
18381905 int * error , char tsep ) {
1839- const char * p = p_item ;
1840- // Skip leading spaces.
1841- while (isspace_ascii (* p )) {
1842- ++ p ;
1906+ if (!p_item || * p_item == '\0' ) {
1907+ * error = ERROR_NO_DIGITS ;
1908+ return 0 ;
18431909 }
18441910
1845- // Handle sign.
1846- const bool isneg = * p == '-' ? true : false;
1847- // Handle sign.
1848- if (isneg || (* p == '+' )) {
1849- p ++ ;
1911+ while (isspace_ascii (* p_item )) {
1912+ ++ p_item ;
18501913 }
18511914
1852- // Check that there is a first digit.
1853- if (!isdigit_ascii (* p )) {
1854- // Error...
1915+ if (!has_digit_int (p_item )) {
18551916 * error = ERROR_NO_DIGITS ;
18561917 return 0 ;
18571918 }
18581919
1859- int64_t number = 0 ;
1860- if (isneg ) {
1861- // If number is greater than pre_min, at least one more digit
1862- // can be processed without overflowing.
1863- int dig_pre_min = - (int_min % 10 );
1864- int64_t pre_min = int_min / 10 ;
1865-
1866- // Process the digits.
1867- char d = * p ;
1868- if (tsep != '\0' ) {
1869- while (1 ) {
1870- if (d == tsep ) {
1871- d = * ++ p ;
1872- continue ;
1873- } else if (!isdigit_ascii (d )) {
1874- break ;
1875- }
1876- if ((number > pre_min ) ||
1877- ((number == pre_min ) && (d - '0' <= dig_pre_min ))) {
1878- number = number * 10 - (d - '0' );
1879- d = * ++ p ;
1880- } else {
1881- * error = ERROR_OVERFLOW ;
1882- return 0 ;
1883- }
1884- }
1885- } else {
1886- while (isdigit_ascii (d )) {
1887- if ((number > pre_min ) ||
1888- ((number == pre_min ) && (d - '0' <= dig_pre_min ))) {
1889- number = number * 10 - (d - '0' );
1890- d = * ++ p ;
1891- } else {
1892- * error = ERROR_OVERFLOW ;
1893- return 0 ;
1894- }
1895- }
1896- }
1897- } else {
1898- // If number is less than pre_max, at least one more digit
1899- // can be processed without overflowing.
1900- int64_t pre_max = int_max / 10 ;
1901- int dig_pre_max = int_max % 10 ;
1902-
1903- // Process the digits.
1904- char d = * p ;
1905- if (tsep != '\0' ) {
1906- while (1 ) {
1907- if (d == tsep ) {
1908- d = * ++ p ;
1909- continue ;
1910- } else if (!isdigit_ascii (d )) {
1911- break ;
1912- }
1913- if ((number < pre_max ) ||
1914- ((number == pre_max ) && (d - '0' <= dig_pre_max ))) {
1915- number = number * 10 + (d - '0' );
1916- d = * ++ p ;
1917-
1918- } else {
1919- * error = ERROR_OVERFLOW ;
1920- return 0 ;
1921- }
1922- }
1923- } else {
1924- while (isdigit_ascii (d )) {
1925- if ((number < pre_max ) ||
1926- ((number == pre_max ) && (d - '0' <= dig_pre_max ))) {
1927- number = number * 10 + (d - '0' );
1928- d = * ++ p ;
1920+ char * processed_str = NULL ;
19291921
1930- } else {
1931- * error = ERROR_OVERFLOW ;
1932- return 0 ;
1933- }
1934- }
1922+ if ( tsep != '\0' && strchr ( p_item , tsep ) != NULL ) {
1923+ processed_str = copy_string_without_char ( p_item , tsep ) ;
1924+ if (! processed_str ) {
1925+ * error = ERROR_NO_MEMORY ;
1926+ return 0 ;
19351927 }
1928+ p_item = processed_str ;
19361929 }
19371930
1938- // Skip trailing spaces.
1939- while (isspace_ascii (* p )) {
1940- ++ p ;
1941- }
1931+ char * endptr = NULL ;
1932+ errno = 0 ;
1933+ int64_t result = strtoll (p_item , & endptr , 10 );
19421934
1943- // Did we use up all the characters?
1944- if (* p ) {
1935+ if (!has_only_spaces (endptr )) {
1936+ // Check first for invalid characters because we may
1937+ // want to skip integer parsing if we find one.
19451938 * error = ERROR_INVALID_CHARS ;
1946- return 0 ;
1939+ result = 0 ;
1940+ } else if (errno == ERANGE || result > int_max || result < int_min ) {
1941+ * error = ERROR_OVERFLOW ;
1942+ result = 0 ;
1943+ } else {
1944+ * error = 0 ;
19471945 }
19481946
1949- * error = 0 ;
1950- return number ;
1947+ // free processed_str that
1948+ // was either allocated due to the presence of tsep
1949+ // or is NULL
1950+ free (processed_str );
1951+
1952+ return result ;
19511953}
19521954
19531955uint64_t str_to_uint64 (uint_state * state , const char * p_item , int64_t int_max ,
19541956 uint64_t uint_max , int * error , char tsep ) {
1955- const char * p = p_item ;
1956- // Skip leading spaces.
1957- while (isspace_ascii (* p )) {
1958- ++ p ;
1957+ if (!p_item || * p_item == '\0' ) {
1958+ * error = ERROR_NO_DIGITS ;
1959+ return 0 ;
19591960 }
19601961
1961- // Handle sign.
1962- if (* p == '-' ) {
1962+ while (isspace_ascii (* p_item )) {
1963+ ++ p_item ;
1964+ }
1965+
1966+ if (* p_item == '-' ) {
19631967 state -> seen_sint = 1 ;
19641968 * error = 0 ;
19651969 return 0 ;
1966- } else if (* p == '+' ) {
1967- p ++ ;
1970+ } else if (* p_item == '+' ) {
1971+ p_item ++ ;
19681972 }
19691973
19701974 // Check that there is a first digit.
1971- if (!isdigit_ascii (* p )) {
1972- // Error...
1975+ if (!isdigit_ascii (* p_item )) {
19731976 * error = ERROR_NO_DIGITS ;
19741977 return 0 ;
19751978 }
19761979
1977- // If number is less than pre_max, at least one more digit
1978- // can be processed without overflowing.
1979- //
1980- // Process the digits.
1981- uint64_t number = 0 ;
1982- const uint64_t pre_max = uint_max / 10 ;
1983- const uint64_t dig_pre_max = uint_max % 10 ;
1984- char d = * p ;
1985- if (tsep != '\0' ) {
1986- while (1 ) {
1987- if (d == tsep ) {
1988- d = * ++ p ;
1989- continue ;
1990- } else if (!isdigit_ascii (d )) {
1991- break ;
1992- }
1993- if ((number < pre_max ) ||
1994- ((number == pre_max ) && ((uint64_t )(d - '0' ) <= dig_pre_max ))) {
1995- number = number * 10 + (d - '0' );
1996- d = * ++ p ;
1980+ char * processed_str = NULL ;
19971981
1998- } else {
1999- * error = ERROR_OVERFLOW ;
2000- return 0 ;
2001- }
2002- }
2003- } else {
2004- while (isdigit_ascii (d )) {
2005- if ((number < pre_max ) ||
2006- ((number == pre_max ) && ((uint64_t )(d - '0' ) <= dig_pre_max ))) {
2007- number = number * 10 + (d - '0' );
2008- d = * ++ p ;
2009-
2010- } else {
2011- * error = ERROR_OVERFLOW ;
2012- return 0 ;
2013- }
1982+ if (tsep != '\0' && strchr (p_item , tsep ) != NULL ) {
1983+ processed_str = copy_string_without_char (p_item , tsep );
1984+ if (!processed_str ) {
1985+ * error = ERROR_NO_MEMORY ;
1986+ return 0 ;
20141987 }
1988+ p_item = processed_str ;
20151989 }
20161990
2017- // Skip trailing spaces.
2018- while (isspace_ascii (* p )) {
2019- ++ p ;
2020- }
1991+ errno = 0 ;
1992+ char * endptr = NULL ;
1993+ uint64_t result = strtoull (p_item , & endptr , 10 );
20211994
2022- // Did we use up all the characters?
2023- if (* p ) {
1995+ if (!has_only_spaces (endptr )) {
20241996 * error = ERROR_INVALID_CHARS ;
2025- return 0 ;
1997+ result = 0 ;
1998+ } else if (errno == ERANGE || result > uint_max ) {
1999+ * error = ERROR_OVERFLOW ;
2000+ result = 0 ;
2001+ } else {
2002+ * error = 0 ;
20262003 }
20272004
2028- if (number > (uint64_t )int_max ) {
2005+ if (result > (uint64_t )int_max ) {
20292006 state -> seen_uint = 1 ;
20302007 }
20312008
2032- * error = 0 ;
2033- return number ;
2009+ free (processed_str );
2010+
2011+ return result ;
20342012}
0 commit comments