@@ -722,6 +722,45 @@ parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
722
722
return 0 ;
723
723
}
724
724
725
+ static int
726
+ parse_add_substring (ReaderObj * self , _csvstate * module_state ,
727
+ PyObject * lineobj , Py_ssize_t start , Py_ssize_t end )
728
+ {
729
+ int kind ;
730
+ const void * data ;
731
+ Py_UCS4 * dest ;
732
+ Py_ssize_t field_limit ;
733
+
734
+ Py_ssize_t len = end - start ;
735
+ if (len <= 0 ) {
736
+ return 0 ;
737
+ }
738
+
739
+ field_limit = FT_ATOMIC_LOAD_SSIZE_RELAXED (module_state -> field_limit );
740
+ if (self -> field_len + len > field_limit ) {
741
+ PyErr_Format (module_state -> error_obj ,
742
+ "field larger than field limit (%zd)" ,
743
+ field_limit );
744
+ return -1 ;
745
+ }
746
+
747
+ while (self -> field_len + len > self -> field_size ) {
748
+ if (!parse_grow_buff (self ))
749
+ return -1 ;
750
+ }
751
+
752
+ kind = PyUnicode_KIND (lineobj );
753
+ data = PyUnicode_DATA (lineobj );
754
+ dest = self -> field + self -> field_len ;
755
+
756
+ for (Py_ssize_t i = 0 ; i < len ; ++ i ) {
757
+ dest [i ] = PyUnicode_READ (kind , data , start + i );
758
+ }
759
+
760
+ self -> field_len += len ;
761
+ return 0 ;
762
+ }
763
+
725
764
static int
726
765
parse_process_char (ReaderObj * self , _csvstate * module_state , Py_UCS4 c )
727
766
{
@@ -923,11 +962,9 @@ Reader_iternext(PyObject *op)
923
962
ReaderObj * self = _ReaderObj_CAST (op );
924
963
925
964
PyObject * fields = NULL ;
926
- Py_UCS4 c ;
927
- Py_ssize_t pos , linelen ;
928
- int kind ;
929
- const void * data ;
965
+ Py_ssize_t pos , linelen , chunk_end , p ;
930
966
PyObject * lineobj ;
967
+ DialectObj * dialect ;
931
968
932
969
_csvstate * module_state = _csv_state_from_type (Py_TYPE (self ),
933
970
"Reader.__next__" );
@@ -937,13 +974,16 @@ Reader_iternext(PyObject *op)
937
974
938
975
if (parse_reset (self ) < 0 )
939
976
return NULL ;
977
+
978
+ dialect = self -> dialect ;
979
+
940
980
do {
941
981
lineobj = PyIter_Next (self -> input_iter );
942
982
if (lineobj == NULL ) {
943
983
/* End of input OR exception */
944
984
if (!PyErr_Occurred () && (self -> field_len != 0 ||
945
985
self -> state == IN_QUOTED_FIELD )) {
946
- if (self -> dialect -> strict )
986
+ if (dialect -> strict )
947
987
PyErr_SetString (module_state -> error_obj ,
948
988
"unexpected end of data" );
949
989
else if (parse_save_field (self ) >= 0 )
@@ -962,17 +1002,109 @@ Reader_iternext(PyObject *op)
962
1002
return NULL ;
963
1003
}
964
1004
++ self -> line_num ;
965
- kind = PyUnicode_KIND (lineobj );
966
- data = PyUnicode_DATA (lineobj );
967
- pos = 0 ;
1005
+
968
1006
linelen = PyUnicode_GET_LENGTH (lineobj );
969
- while (linelen -- ) {
970
- c = PyUnicode_READ (kind , data , pos );
971
- if (parse_process_char (self , module_state , c ) < 0 ) {
972
- Py_DECREF (lineobj );
973
- goto err ;
1007
+ pos = 0 ;
1008
+
1009
+ while (pos < linelen ) {
1010
+ switch (self -> state ) {
1011
+ case IN_FIELD :
1012
+ chunk_end = linelen ;
1013
+
1014
+ p = PyUnicode_FindChar (lineobj , dialect -> delimiter , pos , linelen , 1 );
1015
+ if (p >= 0 && p < chunk_end ) {
1016
+ chunk_end = p ;
1017
+ } else if (p == -2 ) {
1018
+ Py_DECREF (lineobj );
1019
+ goto err ;
1020
+ }
1021
+ if (dialect -> escapechar != NOT_SET ) {
1022
+ p = PyUnicode_FindChar (lineobj , dialect -> escapechar , pos , linelen , 1 );
1023
+ if (p >= 0 && p < chunk_end ) {
1024
+ chunk_end = p ;
1025
+ } else if (p == -2 ) {
1026
+ Py_DECREF (lineobj );
1027
+ goto err ;
1028
+ }
1029
+ }
1030
+ p = PyUnicode_FindChar (lineobj , '\n' , pos , linelen , 1 );
1031
+ if (p >= 0 && p < chunk_end ) {
1032
+ chunk_end = p ;
1033
+ } else if (p == -2 ) {
1034
+ Py_DECREF (lineobj );
1035
+ goto err ;
1036
+ }
1037
+ p = PyUnicode_FindChar (lineobj , '\r' , pos , linelen , 1 );
1038
+ if (p >= 0 && p < chunk_end ) {
1039
+ chunk_end = p ;
1040
+ } else if (p == -2 ) {
1041
+ Py_DECREF (lineobj );
1042
+ goto err ;
1043
+ }
1044
+
1045
+ if (chunk_end > pos ) {
1046
+ if (parse_add_substring (self , module_state , lineobj , pos , chunk_end ) < 0 ) {
1047
+ Py_DECREF (lineobj );
1048
+ goto err ;
1049
+ }
1050
+ }
1051
+ pos = chunk_end ;
1052
+
1053
+ if (pos < linelen ) {
1054
+ Py_UCS4 c = PyUnicode_READ_CHAR (lineobj , pos );
1055
+ if (parse_process_char (self , module_state , c ) < 0 ) {
1056
+ Py_DECREF (lineobj );
1057
+ goto err ;
1058
+ }
1059
+ pos ++ ;
1060
+ }
1061
+ break ;
1062
+ case IN_QUOTED_FIELD :
1063
+ chunk_end = linelen ;
1064
+
1065
+ p = PyUnicode_FindChar (lineobj , dialect -> quotechar , pos , linelen , 1 );
1066
+ if (p >= 0 && p < chunk_end ) {
1067
+ chunk_end = p ;
1068
+ } else if (p == -2 ) {
1069
+ Py_DECREF (lineobj );
1070
+ goto err ;
1071
+ }
1072
+ if (dialect -> escapechar != NOT_SET ) {
1073
+ p = PyUnicode_FindChar (lineobj , dialect -> escapechar , pos , linelen , 1 );
1074
+ if (p >= 0 && p < chunk_end ) {
1075
+ chunk_end = p ;
1076
+ } else if (p == -2 ) {
1077
+ Py_DECREF (lineobj );
1078
+ goto err ;
1079
+ }
1080
+ }
1081
+
1082
+ if (chunk_end > pos ) {
1083
+ if (parse_add_substring (self , module_state , lineobj , pos , chunk_end ) < 0 ) {
1084
+ Py_DECREF (lineobj );
1085
+ goto err ;
1086
+ }
1087
+ }
1088
+ pos = chunk_end ;
1089
+
1090
+ if (pos < linelen ) {
1091
+ Py_UCS4 c = PyUnicode_READ_CHAR (lineobj , pos );
1092
+ if (parse_process_char (self , module_state , c ) < 0 ) {
1093
+ Py_DECREF (lineobj );
1094
+ goto err ;
1095
+ }
1096
+ pos ++ ;
1097
+ }
1098
+ break ;
1099
+ default :
1100
+ Py_UCS4 c = PyUnicode_READ_CHAR (lineobj , pos );
1101
+ if (parse_process_char (self , module_state , c ) < 0 ) {
1102
+ Py_DECREF (lineobj );
1103
+ goto err ;
1104
+ }
1105
+ pos ++ ;
1106
+ break ;
974
1107
}
975
- pos ++ ;
976
1108
}
977
1109
Py_DECREF (lineobj );
978
1110
if (parse_process_char (self , module_state , EOL ) < 0 )
0 commit comments