Skip to content

Commit c07cadc

Browse files
committed
chunk
1 parent e39255e commit c07cadc

File tree

1 file changed

+146
-14
lines changed

1 file changed

+146
-14
lines changed

Modules/_csv.c

Lines changed: 146 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,45 @@ parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
722722
return 0;
723723
}
724724

725+
static int
726+
parse_add_substring(ReaderObj *self, _csvstate *module_state,
727+
PyObject* lineobj, Py_ssize_t start, Py_ssize_t end)
728+
{
729+
int kind;
730+
const void *data;
731+
Py_UCS4 *dest;
732+
Py_ssize_t field_limit;
733+
734+
Py_ssize_t len = end - start;
735+
if (len <= 0) {
736+
return 0;
737+
}
738+
739+
field_limit = FT_ATOMIC_LOAD_SSIZE_RELAXED(module_state->field_limit);
740+
if (self->field_len + len > field_limit) {
741+
PyErr_Format(module_state->error_obj,
742+
"field larger than field limit (%zd)",
743+
field_limit);
744+
return -1;
745+
}
746+
747+
while (self->field_len + len > self->field_size) {
748+
if (!parse_grow_buff(self))
749+
return -1;
750+
}
751+
752+
kind = PyUnicode_KIND(lineobj);
753+
data = PyUnicode_DATA(lineobj);
754+
dest = self->field + self->field_len;
755+
756+
for (Py_ssize_t i = 0; i < len; ++i) {
757+
dest[i] = PyUnicode_READ(kind, data, start + i);
758+
}
759+
760+
self->field_len += len;
761+
return 0;
762+
}
763+
725764
static int
726765
parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
727766
{
@@ -923,11 +962,9 @@ Reader_iternext(PyObject *op)
923962
ReaderObj *self = _ReaderObj_CAST(op);
924963

925964
PyObject *fields = NULL;
926-
Py_UCS4 c;
927-
Py_ssize_t pos, linelen;
928-
int kind;
929-
const void *data;
965+
Py_ssize_t pos, linelen, chunk_end, p;
930966
PyObject *lineobj;
967+
DialectObj *dialect;
931968

932969
_csvstate *module_state = _csv_state_from_type(Py_TYPE(self),
933970
"Reader.__next__");
@@ -937,13 +974,16 @@ Reader_iternext(PyObject *op)
937974

938975
if (parse_reset(self) < 0)
939976
return NULL;
977+
978+
dialect = self->dialect;
979+
940980
do {
941981
lineobj = PyIter_Next(self->input_iter);
942982
if (lineobj == NULL) {
943983
/* End of input OR exception */
944984
if (!PyErr_Occurred() && (self->field_len != 0 ||
945985
self->state == IN_QUOTED_FIELD)) {
946-
if (self->dialect->strict)
986+
if (dialect->strict)
947987
PyErr_SetString(module_state->error_obj,
948988
"unexpected end of data");
949989
else if (parse_save_field(self) >= 0)
@@ -962,17 +1002,109 @@ Reader_iternext(PyObject *op)
9621002
return NULL;
9631003
}
9641004
++self->line_num;
965-
kind = PyUnicode_KIND(lineobj);
966-
data = PyUnicode_DATA(lineobj);
967-
pos = 0;
1005+
9681006
linelen = PyUnicode_GET_LENGTH(lineobj);
969-
while (linelen--) {
970-
c = PyUnicode_READ(kind, data, pos);
971-
if (parse_process_char(self, module_state, c) < 0) {
972-
Py_DECREF(lineobj);
973-
goto err;
1007+
pos = 0;
1008+
1009+
while (pos < linelen) {
1010+
switch (self->state) {
1011+
case IN_FIELD:
1012+
chunk_end = linelen;
1013+
1014+
p = PyUnicode_FindChar(lineobj, dialect->delimiter, pos, linelen, 1);
1015+
if (p >= 0 && p < chunk_end) {
1016+
chunk_end = p;
1017+
} else if (p == -2) {
1018+
Py_DECREF(lineobj);
1019+
goto err;
1020+
}
1021+
if (dialect->escapechar != NOT_SET) {
1022+
p = PyUnicode_FindChar(lineobj, dialect->escapechar, pos, linelen, 1);
1023+
if (p >= 0 && p < chunk_end) {
1024+
chunk_end = p;
1025+
} else if (p == -2) {
1026+
Py_DECREF(lineobj);
1027+
goto err;
1028+
}
1029+
}
1030+
p = PyUnicode_FindChar(lineobj, '\n', pos, linelen, 1);
1031+
if (p >= 0 && p < chunk_end) {
1032+
chunk_end = p;
1033+
} else if (p == -2) {
1034+
Py_DECREF(lineobj);
1035+
goto err;
1036+
}
1037+
p = PyUnicode_FindChar(lineobj, '\r', pos, linelen, 1);
1038+
if (p >= 0 && p < chunk_end) {
1039+
chunk_end = p;
1040+
} else if (p == -2) {
1041+
Py_DECREF(lineobj);
1042+
goto err;
1043+
}
1044+
1045+
if (chunk_end > pos) {
1046+
if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
1047+
Py_DECREF(lineobj);
1048+
goto err;
1049+
}
1050+
}
1051+
pos = chunk_end;
1052+
1053+
if (pos < linelen) {
1054+
Py_UCS4 c = PyUnicode_READ_CHAR(lineobj, pos);
1055+
if (parse_process_char(self, module_state, c) < 0) {
1056+
Py_DECREF(lineobj);
1057+
goto err;
1058+
}
1059+
pos++;
1060+
}
1061+
break;
1062+
case IN_QUOTED_FIELD:
1063+
chunk_end = linelen;
1064+
1065+
p = PyUnicode_FindChar(lineobj, dialect->quotechar, pos, linelen, 1);
1066+
if (p >= 0 && p < chunk_end) {
1067+
chunk_end = p;
1068+
} else if (p == -2) {
1069+
Py_DECREF(lineobj);
1070+
goto err;
1071+
}
1072+
if (dialect->escapechar != NOT_SET) {
1073+
p = PyUnicode_FindChar(lineobj, dialect->escapechar, pos, linelen, 1);
1074+
if (p >= 0 && p < chunk_end) {
1075+
chunk_end = p;
1076+
} else if (p == -2) {
1077+
Py_DECREF(lineobj);
1078+
goto err;
1079+
}
1080+
}
1081+
1082+
if (chunk_end > pos) {
1083+
if (parse_add_substring(self, module_state, lineobj, pos, chunk_end) < 0) {
1084+
Py_DECREF(lineobj);
1085+
goto err;
1086+
}
1087+
}
1088+
pos = chunk_end;
1089+
1090+
if (pos < linelen) {
1091+
Py_UCS4 c = PyUnicode_READ_CHAR(lineobj, pos);
1092+
if (parse_process_char(self, module_state, c) < 0) {
1093+
Py_DECREF(lineobj);
1094+
goto err;
1095+
}
1096+
pos++;
1097+
}
1098+
break;
1099+
default:
1100+
Py_UCS4 c = PyUnicode_READ_CHAR(lineobj, pos);
1101+
if (parse_process_char(self, module_state, c) < 0) {
1102+
Py_DECREF(lineobj);
1103+
goto err;
1104+
}
1105+
pos++;
1106+
break;
9741107
}
975-
pos++;
9761108
}
9771109
Py_DECREF(lineobj);
9781110
if (parse_process_char(self, module_state, EOL) < 0)

0 commit comments

Comments
 (0)