Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -423,11 +423,11 @@ zlib
Optimizations
=============

module_name
-----------

* TODO
csv
---

* :func:`csv.reader` has been optimized and is now around 1.4x faster.
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`138214`.)


Deprecated
Expand Down
30 changes: 30 additions & 0 deletions Lib/test/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ def test_read_oddinputs(self):
self.assertRaises(csv.Error, self._read_test,
['"ab"c'], None, strict = 1)
self._read_test(['"ab"c'], [['abc']], doublequote = 0)
self._read_test([",,,"], [["", "", "", ""]])

self.assertRaises(csv.Error, self._read_test,
[b'abc'], None)
Expand Down Expand Up @@ -423,6 +424,10 @@ def test_read_escape(self):
self._read_test(['a,\0b,c'], [['a', 'b', 'c']], escapechar='\0')
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar=None)
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']])
# '"abc\" with escapechar='\' -> ESCAPE_IN_QUOTED_FIELD + EOL -> '\n' appended
self._read_test(['"abc\\'], [["abc\n"]], escapechar="\\")
with self.assertRaises(csv.Error):
self._read_test(['"abc\\'], None, escapechar="\\", strict=True)

def test_read_quoting(self):
self._read_test(['1,",3,",5'], [['1', ',3,', '5']])
Expand Down Expand Up @@ -513,6 +518,31 @@ def test_read_linenum(self):
self.assertRaises(StopIteration, next, r)
self.assertEqual(r.line_num, 3)

def test_read_linenum_multiline_record(self):
r = csv.reader(['"a', 'b",c', "d,e"])
self.assertEqual(next(r), ["ab", "c"])
self.assertEqual(r.line_num, 2)
self.assertEqual(next(r), ["d", "e"])
self.assertEqual(r.line_num, 3)
with self.assertRaises(StopIteration):
next(r)
self.assertEqual(r.line_num, 3)

def test_read_with_unicode_delimiter_and_quotechar(self):
self._read_test(["αλβλγ"], [["α", "β", "γ"]], delimiter="λ")
self._read_test(
["אαאλאβאλאγא"], [["α", "β", "γ"]], delimiter="λ", quotechar="א"
)

# non-BMP
delim, quote = "😂", "😺"
self._read_test(
[f"{quote}a{quote}{delim}{quote}b{quote}"],
[["a", "b"]],
delimiter=delim,
quotechar=quote,
)

def test_roundtrip_quoteed_newlines(self):
rows = [
['\na', 'b\nc', 'd\n'],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Speed up :func:`csv.reader` by around 1.4x.
161 changes: 147 additions & 14 deletions Modules/_csv.c
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,45 @@ parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
return 0;
}

/*
 * Append the code points lineobj[start:end] to the field being accumulated
 * in self->field, copying the whole run in one call.
 *
 * An empty or inverted range (end <= start) is a no-op that returns 0.
 *
 * Returns 0 on success.  Returns -1 with module_state->error_obj set when
 * the resulting field would exceed the module's field size limit, and -1
 * when parse_grow_buff() fails (presumably with an exception already set
 * by that helper -- confirm against its definition).
 */
static int
parse_add_substring(ReaderObj *self, _csvstate *module_state,
                    PyObject *lineobj, Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t len = end - start;
    if (len <= 0) {
        return 0;
    }

    Py_ssize_t field_limit =
        FT_ATOMIC_LOAD_SSIZE_RELAXED(module_state->field_limit);
    /* Compare against the remaining room rather than computing
       field_len + len: that sum is a signed addition that could overflow
       (undefined behaviour) before any bound has been enforced, e.g. when
       the limit has been raised to the maximum Py_ssize_t value.  The
       first test also keeps the subtraction non-negative. */
    if (field_limit < self->field_len
        || len > field_limit - self->field_len) {
        PyErr_Format(module_state->error_obj,
                     "field larger than field limit (%zd)",
                     field_limit);
        return -1;
    }

    /* Safe to add here: field_len + len <= field_limit was just checked.
       One growth step may not be enough for a long run, hence the loop. */
    while (self->field_len + len > self->field_size) {
        if (!parse_grow_buff(self)) {
            return -1;
        }
    }

    int kind = PyUnicode_KIND(lineobj);
    const void *data = PyUnicode_DATA(lineobj);
    Py_UCS4 *dest = self->field + self->field_len;

    /* PyUnicode_READ widens each code point from the string's storage
       kind to Py_UCS4 as it is copied into the field buffer. */
    for (Py_ssize_t i = 0; i < len; ++i) {
        dest[i] = PyUnicode_READ(kind, data, start + i);
    }

    self->field_len += len;
    return 0;
}

static int
parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
{
Expand Down Expand Up @@ -923,11 +962,10 @@ Reader_iternext(PyObject *op)
ReaderObj *self = _ReaderObj_CAST(op);

PyObject *fields = NULL;
Py_UCS4 c;
Py_ssize_t pos, linelen;
int kind;
const void *data;
Py_ssize_t pos, linelen, chunk_end, p;
PyObject *lineobj;
DialectObj *dialect;
Py_UCS4 c;

_csvstate *module_state = _csv_state_from_type(Py_TYPE(self),
"Reader.__next__");
Expand All @@ -937,13 +975,16 @@ Reader_iternext(PyObject *op)

if (parse_reset(self) < 0)
return NULL;

dialect = self->dialect;

do {
lineobj = PyIter_Next(self->input_iter);
if (lineobj == NULL) {
/* End of input OR exception */
if (!PyErr_Occurred() && (self->field_len != 0 ||
self->state == IN_QUOTED_FIELD)) {
if (self->dialect->strict)
if (dialect->strict)
PyErr_SetString(module_state->error_obj,
"unexpected end of data");
else if (parse_save_field(self) >= 0)
Expand All @@ -962,17 +1003,109 @@ Reader_iternext(PyObject *op)
return NULL;
}
++self->line_num;
kind = PyUnicode_KIND(lineobj);
data = PyUnicode_DATA(lineobj);
pos = 0;

linelen = PyUnicode_GET_LENGTH(lineobj);
while (linelen--) {
c = PyUnicode_READ(kind, data, pos);
if (parse_process_char(self, module_state, c) < 0) {
Py_DECREF(lineobj);
goto err;
pos = 0;

while (pos < linelen) {
switch (self->state) {
case IN_FIELD:
chunk_end = linelen;

p = PyUnicode_FindChar(lineobj, dialect->delimiter, pos, linelen, 1);
if (p >= 0 && p < chunk_end) {
chunk_end = p;
} else if (p == -2) {
Py_DECREF(lineobj);
goto err;
}
if (dialect->escapechar != NOT_SET) {
p = PyUnicode_FindChar(lineobj, dialect->escapechar, pos, linelen, 1);
if (p >= 0 && p < chunk_end) {
chunk_end = p;
} else if (p == -2) {
Py_DECREF(lineobj);
goto err;
}
}
p = PyUnicode_FindChar(lineobj, '\n', pos, linelen, 1);
if (p >= 0 && p < chunk_end) {
chunk_end = p;
} else if (p == -2) {
Py_DECREF(lineobj);
goto err;
}
p = PyUnicode_FindChar(lineobj, '\r', pos, linelen, 1);
if (p >= 0 && p < chunk_end) {
chunk_end = p;
} else if (p == -2) {
Py_DECREF(lineobj);
goto err;
}

if (chunk_end > pos) {
if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
Py_DECREF(lineobj);
goto err;
}
}
pos = chunk_end;

if (pos < linelen) {
c = PyUnicode_READ_CHAR(lineobj, pos);
if (parse_process_char(self, module_state, c) < 0) {
Py_DECREF(lineobj);
goto err;
}
pos++;
}
break;
case IN_QUOTED_FIELD:
chunk_end = linelen;

p = PyUnicode_FindChar(lineobj, dialect->quotechar, pos, linelen, 1);
if (p >= 0 && p < chunk_end) {
chunk_end = p;
} else if (p == -2) {
Py_DECREF(lineobj);
goto err;
}
if (dialect->escapechar != NOT_SET) {
p = PyUnicode_FindChar(lineobj, dialect->escapechar, pos, linelen, 1);
if (p >= 0 && p < chunk_end) {
chunk_end = p;
} else if (p == -2) {
Py_DECREF(lineobj);
goto err;
}
}

if (chunk_end > pos) {
if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
Py_DECREF(lineobj);
goto err;
}
}
pos = chunk_end;

if (pos < linelen) {
c = PyUnicode_READ_CHAR(lineobj, pos);
if (parse_process_char(self, module_state, c) < 0) {
Py_DECREF(lineobj);
goto err;
}
pos++;
}
break;
default:
c = PyUnicode_READ_CHAR(lineobj, pos);
if (parse_process_char(self, module_state, c) < 0) {
Py_DECREF(lineobj);
goto err;
}
pos++;
break;
}
pos++;
}
Py_DECREF(lineobj);
if (parse_process_char(self, module_state, EOL) < 0)
Expand Down
Loading