Skip to content

Commit 2459313

Browse files
committed
perf: use a local buffer to store the processed string
1 parent c3cc4a1 commit 2459313

File tree

3 files changed

+33
-51
lines changed

3 files changed

+33
-51
lines changed

pandas/_libs/include/pandas/parser/tokenizer.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ See LICENSE for the license
1717
#define ERROR_NO_DIGITS 1
1818
#define ERROR_OVERFLOW 2
1919
#define ERROR_INVALID_CHARS 3
20-
#define ERROR_NO_MEMORY 4
2120

2221
#include <stdint.h>
2322

pandas/_libs/parsers.pyx

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
149149
SKIP_LINE
150150
FINISHED
151151

152-
enum: ERROR_OVERFLOW, ERROR_NO_MEMORY
152+
enum: ERROR_OVERFLOW
153153

154154
ctypedef enum BadLineHandleMethod:
155155
ERROR,
@@ -1822,8 +1822,6 @@ cdef _try_uint64(parser_t *parser, int64_t col,
18221822
if error == ERROR_OVERFLOW:
18231823
# Can't get the word variable
18241824
raise OverflowError("Overflow")
1825-
if error == ERROR_NO_MEMORY:
1826-
raise MemoryError()
18271825
return None
18281826

18291827
if uint64_conflict(&state):
@@ -1894,8 +1892,6 @@ cdef _try_int64(parser_t *parser, int64_t col,
18941892
if error == ERROR_OVERFLOW:
18951893
# Can't get the word variable
18961894
raise OverflowError("Overflow")
1897-
if error == ERROR_NO_MEMORY:
1898-
raise MemoryError()
18991895
return None, None
19001896

19011897
return result, na_count

pandas/_libs/src/parser/tokenizer.c

Lines changed: 32 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2828
#include "pandas/portable.h"
2929
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
3030

31+
static const int PROCESSED_WORD_CAPACITY = 128;
32+
3133
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
3234
int64_t start) {
3335
// column i, starting at 0
@@ -1874,31 +1876,23 @@ static inline int has_only_spaces(const char *str) {
18741876
return *str == '\0';
18751877
}
18761878

1877-
/* Copy a string without `char_to_remove`.
1878-
* The returned memory should be free-d with a call to `free`.
1879+
/* Copy a string without `char_to_remove` into `output`,
1880+
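* null-terminating it when the result fits in `output_size` bytes; otherwise errno is set to ERANGE and `output` is left without a terminator.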
18791881
*/
1880-
static char *copy_string_without_char(const char *str, char char_to_remove) {
1881-
size_t chars_to_copy = 0;
1882-
for (const char *src = str; *src != '\0'; src++) {
1882+
static void copy_string_without_char(char *output, const char *str,
1883+
char char_to_remove, size_t output_size) {
1884+
size_t i = 0;
1885+
for (const char *src = str; *src != '\0' && i < output_size; src++) {
18831886
if (*src != char_to_remove) {
1884-
chars_to_copy++;
1887+
output[i++] = *src;
18851888
}
18861889
}
1887-
1888-
char *start = malloc((chars_to_copy + 1) * sizeof(char));
1889-
if (!start) {
1890-
return NULL;
1891-
}
1892-
1893-
char *dst = start;
1894-
for (const char *src = str; *src != '\0'; src++) {
1895-
if (*src != char_to_remove) {
1896-
*dst++ = *src;
1897-
}
1890+
if (i < output_size) {
1891+
output[i] = '\0';
1892+
} else {
1893+
// the filtered string does not fit in output; signal it via errno (callers map ERANGE to ERROR_OVERFLOW)
1894+
errno = ERANGE;
18981895
}
1899-
*dst = '\0';
1900-
1901-
return start;
19021896
}
19031897

19041898
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
@@ -1917,19 +1911,19 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
19171911
return 0;
19181912
}
19191913

1920-
char *processed_str = NULL;
1921-
1914+
errno = 0;
19221915
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
1923-
processed_str = copy_string_without_char(p_item, tsep);
1924-
if (!processed_str) {
1925-
*error = ERROR_NO_MEMORY;
1926-
return 0;
1927-
}
1928-
p_item = processed_str;
1916+
char buffer[PROCESSED_WORD_CAPACITY];
1917+
copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY);
1918+
p_item = buffer;
1919+
}
1920+
1921+
if (errno == ERANGE) {
1922+
*error = ERROR_OVERFLOW;
1923+
return 0;
19291924
}
19301925

19311926
char *endptr = NULL;
1932-
errno = 0;
19331927
int64_t result = strtoll(p_item, &endptr, 10);
19341928

19351929
if (!has_only_spaces(endptr)) {
@@ -1944,11 +1938,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
19441938
*error = 0;
19451939
}
19461940

1947-
// free processed_str that
1948-
// was either allocated due to the presence of tsep
1949-
// or is NULL
1950-
free(processed_str);
1951-
19521941
return result;
19531942
}
19541943

@@ -1977,18 +1966,18 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19771966
return 0;
19781967
}
19791968

1980-
char *processed_str = NULL;
1981-
1969+
errno = 0;
19821970
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
1983-
processed_str = copy_string_without_char(p_item, tsep);
1984-
if (!processed_str) {
1985-
*error = ERROR_NO_MEMORY;
1986-
return 0;
1987-
}
1988-
p_item = processed_str;
1971+
char buffer[PROCESSED_WORD_CAPACITY];
1972+
copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY);
1973+
p_item = buffer;
1974+
}
1975+
1976+
if (errno == ERANGE) {
1977+
*error = ERROR_OVERFLOW;
1978+
return 0;
19891979
}
19901980

1991-
errno = 0;
19921981
char *endptr = NULL;
19931982
uint64_t result = strtoull(p_item, &endptr, 10);
19941983

@@ -2006,7 +1995,5 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
20061995
state->seen_uint = 1;
20071996
}
20081997

2009-
free(processed_str);
2010-
20111998
return result;
20121999
}

0 commit comments

Comments
 (0)