chunk

maurycy · maurycy · commit c07cadccb7ac · 2025-08-28T01:45:49.000+02:00
diff --git a/Modules/_csv.c b/Modules/_csv.c
@@ -722,6 +722,45 @@ parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
     return 0;
 }
 
+static int
+parse_add_substring(ReaderObj *self, _csvstate *module_state,
+                    PyObject* lineobj, Py_ssize_t start, Py_ssize_t end)
+{
+    int kind;
+    const void *data;
+    Py_UCS4 *dest;
+    Py_ssize_t field_limit;
+
+    Py_ssize_t len = end - start;
+    if (len <= 0) {
+        return 0;
+    }
+
+    field_limit = FT_ATOMIC_LOAD_SSIZE_RELAXED(module_state->field_limit);
+    if (self->field_len + len > field_limit) {
+        PyErr_Format(module_state->error_obj,
+                     "field larger than field limit (%zd)",
+                     field_limit);
+        return -1;
+    }
+
+    while (self->field_len + len > self->field_size) {
+        if (!parse_grow_buff(self))
+            return -1;
+    }
+
+    kind = PyUnicode_KIND(lineobj);
+    data = PyUnicode_DATA(lineobj);
+    dest = self->field + self->field_len;
+
+    for (Py_ssize_t i = 0; i < len; ++i) {
+        dest[i] = PyUnicode_READ(kind, data, start + i);
+    }
+
+    self->field_len += len;
+    return 0;
+}
+
 static int
 parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
 {
@@ -923,11 +962,9 @@ Reader_iternext(PyObject *op)
     ReaderObj *self = _ReaderObj_CAST(op);
 
     PyObject *fields = NULL;
-    Py_UCS4 c;
-    Py_ssize_t pos, linelen;
-    int kind;
-    const void *data;
+    Py_ssize_t pos, linelen, chunk_end, p;
     PyObject *lineobj;
+    DialectObj *dialect;
 
     _csvstate *module_state = _csv_state_from_type(Py_TYPE(self),
                                                    "Reader.__next__");
@@ -937,13 +974,16 @@ Reader_iternext(PyObject *op)
 
     if (parse_reset(self) < 0)
         return NULL;
+
+    dialect = self->dialect;
+
     do {
         lineobj = PyIter_Next(self->input_iter);
         if (lineobj == NULL) {
             /* End of input OR exception */
             if (!PyErr_Occurred() && (self->field_len != 0 ||
                                       self->state == IN_QUOTED_FIELD)) {
-                if (self->dialect->strict)
+                if (dialect->strict)
                     PyErr_SetString(module_state->error_obj,
                                     "unexpected end of data");
                 else if (parse_save_field(self) >= 0)
@@ -962,17 +1002,109 @@ Reader_iternext(PyObject *op)
             return NULL;
         }
         ++self->line_num;
-        kind = PyUnicode_KIND(lineobj);
-        data = PyUnicode_DATA(lineobj);
-        pos = 0;
+
         linelen = PyUnicode_GET_LENGTH(lineobj);
-        while (linelen--) {
-            c = PyUnicode_READ(kind, data, pos);
-            if (parse_process_char(self, module_state, c) < 0) {
-                Py_DECREF(lineobj);
-                goto err;
+        pos = 0;
+
+        while (pos < linelen) {
+            switch (self->state) {
+            case IN_FIELD:
+                chunk_end = linelen;
+
+                p = PyUnicode_FindChar(lineobj, dialect->delimiter, pos, linelen, 1);
+                if (p >= 0 && p < chunk_end) {
+                    chunk_end = p;
+                } else if (p == -2) {
+                    Py_DECREF(lineobj);
+                    goto err;
+                }
+                if (dialect->escapechar != NOT_SET) {
+                    p = PyUnicode_FindChar(lineobj, dialect->escapechar, pos, linelen, 1);
+                    if (p >= 0 && p < chunk_end) {
+                        chunk_end = p;
+                    } else if (p == -2) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                }
+                p = PyUnicode_FindChar(lineobj, '\n', pos, linelen, 1);
+                if (p >= 0 && p < chunk_end) {
+                    chunk_end = p;
+                } else if (p == -2) {
+                    Py_DECREF(lineobj);
+                    goto err;
+                }
+                p = PyUnicode_FindChar(lineobj, '\r', pos, linelen, 1);
+                if (p >= 0 && p < chunk_end) {
+                    chunk_end = p;
+                } else if (p == -2) {
+                    Py_DECREF(lineobj);
+                    goto err;
+                }
+
+                if (chunk_end > pos) {
+                    if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                }
+                pos = chunk_end;
+
+                if (pos < linelen) {
+                    Py_UCS4 c = PyUnicode_READ_CHAR(lineobj, pos);
+                    if (parse_process_char(self, module_state, c) < 0) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                    pos++;
+                }
+                break;
+            case IN_QUOTED_FIELD:
+                chunk_end = linelen;
+
+                p = PyUnicode_FindChar(lineobj, dialect->quotechar, pos, linelen, 1);
+                if (p >= 0 && p < chunk_end) {
+                    chunk_end = p;
+                } else if (p == -2) {
+                    Py_DECREF(lineobj);
+                    goto err;
+                }
+                if (dialect->escapechar != NOT_SET) {
+                    p = PyUnicode_FindChar(lineobj, dialect->escapechar, pos, linelen, 1);
+                    if (p >= 0 && p < chunk_end) {
+                        chunk_end = p;
+                    } else if (p == -2) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                }
+
+                if (chunk_end > pos) {
+                    if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                }
+                pos = chunk_end;
+
+                if (pos < linelen) {
+                    Py_UCS4 c = PyUnicode_READ_CHAR(lineobj, pos);
+                    if (parse_process_char(self, module_state, c) < 0) {
+                        Py_DECREF(lineobj);
+                        goto err;
+                    }
+                    pos++;
+                }
+                break;
+            default:
+                Py_UCS4 c = PyUnicode_READ_CHAR(lineobj, pos);
+                if (parse_process_char(self, module_state, c) < 0) {
+                    Py_DECREF(lineobj);
+                    goto err;
+                }
+                pos++;
+                break;
             }
-            pos++;
         }
         Py_DECREF(lineobj);
         if (parse_process_char(self, module_state, EOL) < 0)