Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -423,11 +423,11 @@ zlib
Optimizations
=============

module_name
-----------

* TODO
csv
---

* :func:`csv.reader` has been optimized and is now around 2x faster.
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`138214`.)


Deprecated
Expand Down
30 changes: 30 additions & 0 deletions Lib/test/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ def test_read_oddinputs(self):
self.assertRaises(csv.Error, self._read_test,
['"ab"c'], None, strict = 1)
self._read_test(['"ab"c'], [['abc']], doublequote = 0)
self._read_test([",,,"], [["", "", "", ""]])

self.assertRaises(csv.Error, self._read_test,
[b'abc'], None)
Expand Down Expand Up @@ -423,6 +424,10 @@ def test_read_escape(self):
self._read_test(['a,\0b,c'], [['a', 'b', 'c']], escapechar='\0')
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar=None)
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']])
# '"abc\" with escapechar='\' -> ESCAPE_IN_QUOTED_FIELD + EOL -> '\n' appended
self._read_test(['"abc\\'], [["abc\n"]], escapechar="\\")
with self.assertRaises(csv.Error):
self._read_test(['"abc\\'], None, escapechar="\\", strict=True)

def test_read_quoting(self):
self._read_test(['1,",3,",5'], [['1', ',3,', '5']])
Expand Down Expand Up @@ -513,6 +518,31 @@ def test_read_linenum(self):
self.assertRaises(StopIteration, next, r)
self.assertEqual(r.line_num, 3)

def test_read_linenum_multiline_record(self):
r = csv.reader(['"a', 'b",c', "d,e"])
self.assertEqual(next(r), ["ab", "c"])
self.assertEqual(r.line_num, 2)
self.assertEqual(next(r), ["d", "e"])
self.assertEqual(r.line_num, 3)
with self.assertRaises(StopIteration):
next(r)
self.assertEqual(r.line_num, 3)

def test_read_with_unicode_delimiter_and_quotechar(self):
self._read_test(["αλβλγ"], [["α", "β", "γ"]], delimiter="λ")
self._read_test(
["אαאλאβאλאγא"], [["α", "β", "γ"]], delimiter="λ", quotechar="א"
)

# non-BMP
delim, quote = "😂", "😺"
self._read_test(
[f"{quote}a{quote}{delim}{quote}b{quote}"],
[["a", "b"]],
delimiter=delim,
quotechar=quote,
)

def test_roundtrip_quoteed_newlines(self):
rows = [
['\na', 'b\nc', 'd\n'],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Speed up :func:`csv.reader` by around 2x.
129 changes: 118 additions & 11 deletions Modules/_csv.c
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,45 @@ parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
return 0;
}

/* Append the characters lineobj[start:end] to the field currently being
 * accumulated in self->field.
 *
 * An empty or inverted range (end <= start) is a no-op.  The range is not
 * validated here: the caller must pass indices that lie within lineobj.
 *
 * Returns 0 on success.  Returns -1 if the field would grow beyond
 * module_state->field_limit (module_state->error_obj is raised) or if
 * parse_grow_buff() reports failure (it is expected to have set the
 * exception itself).
 */
static int
parse_add_substring(ReaderObj *self, _csvstate *module_state,
                    PyObject *lineobj, Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t len = end - start;
    if (len <= 0) {
        return 0;
    }

    Py_ssize_t field_limit = FT_ATOMIC_LOAD_SSIZE_RELAXED(module_state->field_limit);
    /* Compare against the remaining headroom rather than computing
     * field_len + len up front: that sum could overflow Py_ssize_t,
     * which is undefined behaviour for a signed type. */
    if (field_limit < self->field_len
        || len > field_limit - self->field_len) {
        PyErr_Format(module_state->error_obj,
                     "field larger than field limit (%zd)",
                     field_limit);
        return -1;
    }

    /* Cannot overflow: the check above guarantees needed <= field_limit. */
    Py_ssize_t needed = self->field_len + len;

    /* A single call to parse_grow_buff() may not enlarge the buffer enough
     * for a long chunk, so keep growing until the chunk fits. */
    while (needed > self->field_size) {
        if (!parse_grow_buff(self)) {
            return -1;
        }
    }

    /* The field buffer holds Py_UCS4 code points regardless of the source
     * string's internal representation, so widen character by character. */
    int kind = PyUnicode_KIND(lineobj);
    const void *data = PyUnicode_DATA(lineobj);
    Py_UCS4 *dest = self->field + self->field_len;

    for (Py_ssize_t i = 0; i < len; ++i) {
        dest[i] = PyUnicode_READ(kind, data, start + i);
    }

    self->field_len = needed;
    return 0;
}

static int
parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
{
Expand Down Expand Up @@ -924,11 +963,33 @@ Reader_iternext(PyObject *op)

PyObject *fields = NULL;
Py_UCS4 c;
Py_ssize_t pos, linelen;
int kind;
const void *data;
Py_ssize_t pos, linelen, chunk_end, p;
PyObject *lineobj;

#define FIND_AND_UPDATE_CHUNK_END(c) \
do \
{ \
p = PyUnicode_FindChar(lineobj, (c), pos, linelen, 1); \
if (p == -2) { \
Py_DECREF(lineobj); \
goto err; \
} \
if (p >= 0 && p < chunk_end) { \
chunk_end = p; \
} \
} while (0)

#define PROCESS_CHAR_AND_ADVANCE() \
do \
{ \
c = PyUnicode_READ_CHAR(lineobj, pos); \
if (parse_process_char(self, module_state, c) < 0) { \
Py_DECREF(lineobj); \
goto err; \
} \
pos++; \
} while (0)

_csvstate *module_state = _csv_state_from_type(Py_TYPE(self),
"Reader.__next__");
if (module_state == NULL) {
Expand Down Expand Up @@ -962,17 +1023,61 @@ Reader_iternext(PyObject *op)
return NULL;
}
++self->line_num;
kind = PyUnicode_KIND(lineobj);
data = PyUnicode_DATA(lineobj);
pos = 0;
linelen = PyUnicode_GET_LENGTH(lineobj);
while (linelen--) {
c = PyUnicode_READ(kind, data, pos);
if (parse_process_char(self, module_state, c) < 0) {
Py_DECREF(lineobj);
goto err;

while (pos < linelen) {
/* For IN_FIELD and IN_QUOTED_FIELD states, optimize by finding
* chunks of characters that can be processed together up to the
* next special character (eg: delimiter, quote, escape).
*/
switch (self->state) {
case IN_FIELD:
chunk_end = linelen;

FIND_AND_UPDATE_CHUNK_END(self->dialect->delimiter);
if (self->dialect->escapechar != NOT_SET) {
FIND_AND_UPDATE_CHUNK_END(self->dialect->escapechar);
}
FIND_AND_UPDATE_CHUNK_END('\n');
FIND_AND_UPDATE_CHUNK_END('\r');

if (chunk_end > pos) {
if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
Py_DECREF(lineobj);
goto err;
}
}
pos = chunk_end;

if (pos < linelen) {
PROCESS_CHAR_AND_ADVANCE();
}
break;
case IN_QUOTED_FIELD:
chunk_end = linelen;

FIND_AND_UPDATE_CHUNK_END(self->dialect->quotechar);
if (self->dialect->escapechar != NOT_SET) {
FIND_AND_UPDATE_CHUNK_END(self->dialect->escapechar);
}

if (chunk_end > pos) {
if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
Py_DECREF(lineobj);
goto err;
}
}
pos = chunk_end;

if (pos < linelen) {
PROCESS_CHAR_AND_ADVANCE();
}
break;
default:
PROCESS_CHAR_AND_ADVANCE();
break;
}
pos++;
}
Py_DECREF(lineobj);
if (parse_process_char(self, module_state, EOL) < 0)
Expand All @@ -983,6 +1088,8 @@ Reader_iternext(PyObject *op)
self->fields = NULL;
err:
return fields;
#undef PROCESS_CHAR_AND_ADVANCE
#undef FIND_AND_UPDATE_CHUNK_END
}

static void
Expand Down
Loading