Skip to content

Commit 2bea3c2

Browse files
committed
perf: write in chunks
1 parent 2287944 commit 2bea3c2

File tree

1 file changed

+29
-12
lines changed

1 file changed

+29
-12
lines changed

pandas/_libs/src/parser/tokenizer.c

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1877,22 +1877,37 @@ static inline bool has_only_spaces(const char *str) {
18771877
}
18781878

18791879
/* Copy a string without `char_to_remove` into `output`,
1880-
* while ensuring it's null terminated.
1880+
* it assumes that output is filled with `\0`,
1881+
* so it won't null terminate the result.
18811882
*/
18821883
static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
18831884
const char *str, char char_to_remove) {
1884-
size_t i = 0;
1885-
for (const char *src = str; *src != '\0' && i < PROCESSED_WORD_CAPACITY;
1886-
src++) {
1887-
if (*src != char_to_remove) {
1888-
output[i++] = *src;
1885+
char *dst = output;
1886+
const char *src = str;
1887+
// last character is reserved for null terminator.
1888+
const char *end = output + PROCESSED_WORD_CAPACITY - 1;
1889+
1890+
while (*src != '\0' && dst < end) {
1891+
const char *next = src;
1892+
// find EOS or char_to_remove
1893+
while (*next != '\0' && *next != char_to_remove) {
1894+
next++;
18891895
}
1890-
}
1891-
if (i < PROCESSED_WORD_CAPACITY) {
1892-
output[i] = '\0';
1893-
} else {
1894-
// str is too big, probably would overflow
1895-
errno = ERANGE;
1896+
1897+
size_t len = next - src;
1898+
if (dst + len > end) {
1899+
// Can't write here, str is too big
1900+
errno = ERANGE;
1901+
return;
1902+
}
1903+
1904+
// copy block
1905+
memcpy(dst, src, len);
1906+
1907+
// go to next available location to write
1908+
dst += len;
1909+
// Move past char to remove
1910+
src = *next == char_to_remove ? next + 1 : next;
18961911
}
18971912
}
18981913

@@ -1915,6 +1930,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
19151930
errno = 0;
19161931
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
19171932
char buffer[PROCESSED_WORD_CAPACITY];
1933+
memset(buffer, '\0', sizeof(buffer));
19181934
copy_string_without_char(buffer, p_item, tsep);
19191935
p_item = buffer;
19201936
}
@@ -1970,6 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19701986
errno = 0;
19711987
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
19721988
char buffer[PROCESSED_WORD_CAPACITY];
1989+
memset(buffer, '\0', sizeof(buffer));
19731990
copy_string_without_char(buffer, p_item, tsep);
19741991
p_item = buffer;
19751992
}

0 commit comments

Comments
 (0)