diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index 43c40e4d0f3154..333f9b1f6d20e0 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -423,11 +423,11 @@ zlib
 Optimizations
 =============
 
-module_name
------------
-
-* TODO
+csv
+---
 
+* :func:`csv.reader` has been optimized and is now around 2x faster.
+  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`138213`.)
 
 
 Deprecated
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index 60feab225a107c..55ab9b6c9eb6c0 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -373,6 +373,7 @@ def test_read_oddinputs(self):
         self.assertRaises(csv.Error, self._read_test,
                           ['"ab"c'], None, strict = 1)
         self._read_test(['"ab"c'], [['abc']], doublequote = 0)
+        self._read_test([",,,"], [["", "", "", ""]])
 
         self.assertRaises(csv.Error, self._read_test,
                           [b'abc'], None)
@@ -423,6 +424,10 @@ def test_read_escape(self):
         self._read_test(['a,\0b,c'], [['a', 'b', 'c']], escapechar='\0')
         self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar=None)
         self._read_test(['a,\\b,c'], [['a', '\\b', 'c']])
+        # Input '"abc\': an escaped EOL inside a quoted field becomes '\n'.
+        self._read_test(['"abc\\'], [["abc\n"]], escapechar="\\")
+        with self.assertRaises(csv.Error):
+            self._read_test(['"abc\\'], None, escapechar="\\", strict=True)
 
     def test_read_quoting(self):
         self._read_test(['1,",3,",5'], [['1', ',3,', '5']])
@@ -513,6 +518,31 @@ def test_read_linenum(self):
         self.assertRaises(StopIteration, next, r)
         self.assertEqual(r.line_num, 3)
 
+    def test_read_linenum_multiline_record(self):
+        r = csv.reader(['"a', 'b",c', "d,e"])
+        self.assertEqual(next(r), ["ab", "c"])
+        self.assertEqual(r.line_num, 2)
+        self.assertEqual(next(r), ["d", "e"])
+        self.assertEqual(r.line_num, 3)
+        with self.assertRaises(StopIteration):
+            next(r)
+        self.assertEqual(r.line_num, 3)
+
+    def test_read_with_unicode_delimiter_and_quotechar(self):
+        self._read_test(["αλβλγ"], [["α", "β", "γ"]], delimiter="λ")
+        self._read_test(
+            ["אαאλאβאλאγא"], [["α", "β", "γ"]], delimiter="λ", quotechar="א"
+        )
+
+        # non-BMP
+        delim, quote = "😂", "😺"
+        self._read_test(
+            [f"{quote}a{quote}{delim}{quote}b{quote}"],
+            [["a", "b"]],
+            delimiter=delim,
+            quotechar=quote,
+        )
+
     def test_roundtrip_quoteed_newlines(self):
         rows = [
             ['\na', 'b\nc', 'd\n'],
diff --git a/Misc/NEWS.d/next/Library/2025-08-28-02-41-14.gh-issue-138213.8m2OO9.rst b/Misc/NEWS.d/next/Library/2025-08-28-02-41-14.gh-issue-138213.8m2OO9.rst
new file mode 100644
index 00000000000000..e2b226b90ae447
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-08-28-02-41-14.gh-issue-138213.8m2OO9.rst
@@ -0,0 +1 @@
+Speed up :func:`csv.reader` by around 2x.
diff --git a/Modules/_csv.c b/Modules/_csv.c
index 2e04136e0ac657..e21d2eeab83fde 100644
--- a/Modules/_csv.c
+++ b/Modules/_csv.c
@@ -722,6 +722,51 @@ parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
     return 0;
 }
 
+/* Append the characters lineobj[start:end] to the field being built.
+ *
+ * The field size limit is checked and the buffer grown up front for the
+ * whole run, so the copy loop does no per-character checks.  An empty or
+ * negative range is a no-op.  Returns 0 on success and -1 on failure.
+ */
+static int
+parse_add_substring(ReaderObj *self, _csvstate *module_state,
+                    PyObject *lineobj, Py_ssize_t start, Py_ssize_t end)
+{
+    int kind;
+    const void *data;
+    Py_UCS4 *dest;
+    Py_ssize_t field_limit;
+
+    Py_ssize_t len = end - start;
+    if (len <= 0) {
+        return 0;
+    }
+
+    field_limit = FT_ATOMIC_LOAD_SSIZE_RELAXED(module_state->field_limit);
+    if (self->field_len + len > field_limit) {
+        PyErr_Format(module_state->error_obj,
+                     "field larger than field limit (%zd)",
+                     field_limit);
+        return -1;
+    }
+
+    while (self->field_len + len > self->field_size) {
+        if (!parse_grow_buff(self))
+            return -1;
+    }
+
+    kind = PyUnicode_KIND(lineobj);
+    data = PyUnicode_DATA(lineobj);
+    dest = self->field + self->field_len;
+
+    for (Py_ssize_t i = 0; i < len; ++i) {
+        dest[i] = PyUnicode_READ(kind, data, start + i);
+    }
+
+    self->field_len += len;
+    return 0;
+}
+
 static int
 parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
 {
@@ -924,11 +969,44 @@ Reader_iternext(PyObject *op)
 
     PyObject *fields = NULL;
     Py_UCS4 c;
-    Py_ssize_t pos, linelen;
-    int kind;
-    const void *data;
+    Py_ssize_t pos, linelen, chunk_end;
+    /* Index of the next occurrence of each special character in the current
+     * line (linelen if there is none); stale whenever it is less than pos. */
+    Py_ssize_t next_delim, next_quote, next_escape, next_lf, next_cr;
     PyObject *lineobj;
 
+    /* Lower chunk_end to the next occurrence of ch at or after pos.  The
+     * result of each search is cached in `next` and reused until pos moves
+     * past it, so each special character scans a line at most once in
+     * total, however many fields or escapes the line contains. */
+#define CLAMP_CHUNK_END(next, ch)                                            \
+    do {                                                                     \
+        if ((next) < pos) {                                                  \
+            (next) = PyUnicode_FindChar(lineobj, (ch), pos, linelen, 1);     \
+            if ((next) == -2) {                                              \
+                Py_DECREF(lineobj);                                          \
+                goto err;                                                    \
+            }                                                                \
+            if ((next) < 0) {                                                \
+                (next) = linelen;                                            \
+            }                                                                \
+        }                                                                    \
+        if ((next) < chunk_end) {                                            \
+            chunk_end = (next);                                              \
+        }                                                                    \
+    } while (0)
+
+    /* Feed the character at pos to the state machine and step past it. */
+#define PROCESS_CHAR_AND_ADVANCE()                                           \
+    do {                                                                     \
+        c = PyUnicode_READ_CHAR(lineobj, pos);                               \
+        if (parse_process_char(self, module_state, c) < 0) {                 \
+            Py_DECREF(lineobj);                                              \
+            goto err;                                                        \
+        }                                                                    \
+        pos++;                                                               \
+    } while (0)
+
     _csvstate *module_state = _csv_state_from_type(Py_TYPE(self),
                                                    "Reader.__next__");
     if (module_state == NULL) {
@@ -962,17 +1040,65 @@ Reader_iternext(PyObject *op)
             return NULL;
         }
         ++self->line_num;
-        kind = PyUnicode_KIND(lineobj);
-        data = PyUnicode_DATA(lineobj);
         pos = 0;
         linelen = PyUnicode_GET_LENGTH(lineobj);
-        while (linelen--) {
-            c = PyUnicode_READ(kind, data, pos);
-            if (parse_process_char(self, module_state, c) < 0) {
-                Py_DECREF(lineobj);
-                goto err;
+        /* New line: forget every cached special-character position. */
+        next_delim = next_quote = next_escape = next_lf = next_cr = -1;
+
+        while (pos < linelen) {
+            /* For IN_FIELD and IN_QUOTED_FIELD states, optimize by finding
+             * chunks of characters that can be processed together up to the
+             * next special character (e.g. delimiter, quote, escape).
+             */
+            switch (self->state) {
+            case IN_FIELD:
+                /* Stop at the first delimiter, escape or line ending. */
+                chunk_end = linelen;
+                CLAMP_CHUNK_END(next_delim, self->dialect->delimiter);
+                if (self->dialect->escapechar != NOT_SET) {
+                    CLAMP_CHUNK_END(next_escape, self->dialect->escapechar);
+                }
+                CLAMP_CHUNK_END(next_lf, '\n');
+                CLAMP_CHUNK_END(next_cr, '\r');
+
+                if (chunk_end > pos) {
+                    if (parse_add_substring(self, module_state, lineobj,
+                                            pos, chunk_end) < 0) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                }
+                pos = chunk_end;
+
+                if (pos < linelen) {
+                    PROCESS_CHAR_AND_ADVANCE();
+                }
+                break;
+            case IN_QUOTED_FIELD:
+                /* Stop at the first quote or escape character. */
+                chunk_end = linelen;
+                CLAMP_CHUNK_END(next_quote, self->dialect->quotechar);
+                if (self->dialect->escapechar != NOT_SET) {
+                    CLAMP_CHUNK_END(next_escape, self->dialect->escapechar);
+                }
+
+                if (chunk_end > pos) {
+                    if (parse_add_substring(self, module_state, lineobj,
+                                            pos, chunk_end) < 0) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                }
+                pos = chunk_end;
+
+                if (pos < linelen) {
+                    PROCESS_CHAR_AND_ADVANCE();
+                }
+                break;
+            default:
+                PROCESS_CHAR_AND_ADVANCE();
+                break;
             }
-            pos++;
         }
         Py_DECREF(lineobj);
         if (parse_process_char(self, module_state, EOL) < 0)
@@ -983,6 +1109,8 @@ Reader_iternext(PyObject *op)
     self->fields = NULL;
 err:
     return fields;
+#undef PROCESS_CHAR_AND_ADVANCE
+#undef CLAMP_CHUNK_END
 }
 
 static void