@@ -18,6 +18,22 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
1818from  opteryx.third_party.fastfloat.fast_float cimport c_parse_fast_float
1919
2020import  orjson as  json
21+ import  platform
22+ 
23+ cdef extern from  " simd_search.h"  :
24+     size_t neon_count(const char *  data, size_t length, char  target)  #  presumably the number of bytes in data[0:length] equal to target -- confirm against simd_search.h
25+     size_t avx_count(const char *  data, size_t length, char  target)  #  same signature as neon_count, so either can be stored in simd_count
26+ 
27+ 
28+ #  Function pointer to the byte-counting routine; it is bound by the architecture check below when the module is loaded
29+ cdef size_t (* simd_count)(const char * , size_t, char )
30+ 
31+ #  platform.machine() is lower-cased so the comparison is case-insensitive; ARM64 names select neon_count, everything else avx_count
32+ _arch =  platform.machine().lower()
33+ if  _arch in  (' arm64'  , ' aarch64'  ):
34+     simd_count =  neon_count
35+ else :
36+     simd_count =  avx_count  #  NOTE(review): taken on every non-ARM64 machine, not only x86 -- confirm avx_count in simd_search.h is safe there
2137
2238
2339cdef enum  ColumnType:
@@ -107,20 +123,19 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
107123        remaining =  end -  pos
108124        if  remaining <=  0 :
109125            return  NULL 
110-         key_pos =  < const char * > memchr(pos, b ' " '  , < size_t> remaining)
126+         key_pos =  < const char * > memchr(pos, 34 , < size_t> remaining)   #  '"' 
111127        if  key_pos ==  NULL :
112128            return  NULL 
113129
114130        key_pos +=  1   #  Move past the opening quote
115131
116132        #  Check if this matches our key
117-         if  (end -  key_pos >=  key_len and  memcmp(key_pos, key, < size_t> key_len) ==  0  and  key_pos[key_len] ==  b' "'  ):
118- 
133+         if  (end -  key_pos >=  key_len and  memcmp(key_pos, key, < size_t> key_len) ==  0  and  key_pos[key_len] ==  34 ):  #  '"'
119134            #  Found the key, now find the colon
120135            value_pos =  key_pos +  key_len +  1   #  Skip closing quote
121136
122137            #  Skip whitespace and colon
123-             while  value_pos <  end and  (value_pos[0 ] ==  b '   '   or  value_pos[0 ] ==  b ' \t '   or  value_pos[0 ] ==  b ' : ' ): 
138+             while  value_pos <  end and  (value_pos[0 ] ==  32   or  value_pos[0 ] ==  9  or  value_pos[0 ] ==  58 ):   #  ' ', '\t', ':' 
124139                value_pos +=  1 
125140
126141            if  value_pos >=  end:
@@ -130,84 +145,84 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
130145            value_start[0 ] =  value_pos -  line
131146
132147            #  Determine value type and find end
133-             if  first_char ==  b ' "' : 
148+             if  first_char ==  34 :   #   '"'
134149                #  String value - find closing quote, handling escapes
135150                quote_start =  value_pos +  1 
136151                quote_end =  quote_start
137152                backslash_run =  0 
138153                while  quote_end <  end:
139-                     if  quote_end[0 ] ==  b ' \\ ' : 
154+                     if  quote_end[0 ] ==  92 :   #   '\\'
140155                        backslash_run +=  1 
141156                    else :
142-                         if  quote_end[0 ] ==  b ' " '   and  (backslash_run &  1 ) ==  0 :
157+                         if  quote_end[0 ] ==  34  and  (backslash_run &  1 ) ==  0 :   #  '"' 
143158                            #  Found unescaped quote
144159                            value_len[0 ] =  (quote_end +  1 ) -  value_pos
145160                            return  value_pos
146161                        backslash_run =  0 
147162                    quote_end +=  1 
148163                return  NULL 
149164
150-             elif  first_char ==  b ' {' : 
165+             elif  first_char ==  123 :   #   '{'
151166                #  Object - count braces
152167                brace_count =  1 
153168                quote_end =  value_pos +  1 
154169                while  quote_end <  end and  brace_count >  0 :
155-                     if  quote_end[0 ] ==  b ' {' : 
170+                     if  quote_end[0 ] ==  123 :   #   '{'
156171                        brace_count +=  1 
157-                     elif  quote_end[0 ] ==  b ' }' : 
172+                     elif  quote_end[0 ] ==  125 :   #   '}'
158173                        brace_count -=  1 
159-                     elif  quote_end[0 ] ==  b ' "' : 
174+                     elif  quote_end[0 ] ==  34 :   #   '"'
160175                        #  Skip string contents to avoid premature brace counting
161176                        quote_end +=  1 
162177                        while  quote_end <  end:
163-                             if  quote_end[0 ] ==  b ' \\ ' : 
178+                             if  quote_end[0 ] ==  92 :   #  '\' 
164179                                quote_end +=  2 
165180                                continue 
166-                             if  quote_end[0 ] ==  b ' "' : 
181+                             if  quote_end[0 ] ==  34 :   #   '"'
167182                                break 
168183                            quote_end +=  1 
169184                    quote_end +=  1 
170185                value_len[0 ] =  quote_end -  value_pos
171186                return  value_pos
172187
173-             elif  first_char ==  b ' [' : 
188+             elif  first_char ==  91 :   #   '['
174189                #  Array - count brackets
175190                bracket_count =  1 
176191                quote_end =  value_pos +  1 
177192                while  quote_end <  end and  bracket_count >  0 :
178-                     if  quote_end[0 ] ==  b ' [' : 
193+                     if  quote_end[0 ] ==  91 :   #   '['
179194                        bracket_count +=  1 
180-                     elif  quote_end[0 ] ==  b ' ]' : 
195+                     elif  quote_end[0 ] ==  93 :   #   ']'
181196                        bracket_count -=  1 
182-                     elif  quote_end[0 ] ==  b ' "' : 
197+                     elif  quote_end[0 ] ==  34 :   #   '"'
183198                        #  Skip string contents inside arrays
184199                        quote_end +=  1 
185200                        while  quote_end <  end:
186-                             if  quote_end[0 ] ==  b ' \\ '  :
201+                             if  quote_end[0 ] ==  92 :
187202                                quote_end +=  2 
188203                                continue 
189-                             if  quote_end[0 ] ==  b ' "' : 
204+                             if  quote_end[0 ] ==  34 :   #   '"'
190205                                break 
191206                            quote_end +=  1 
192207                    quote_end +=  1 
193208                value_len[0 ] =  quote_end -  value_pos
194209                return  value_pos
195210
196-             elif  first_char ==  b ' n' : 
211+             elif  first_char ==  110 :   #   'n'
197212                #  null
198213                if  end -  value_pos >=  4  and  memcmp(value_pos, b" null"  , 4 ) ==  0 :
199214                    value_len[0 ] =  4 
200215                    return  value_pos
201216                return  NULL 
202217
203-             elif  first_char ==  b ' t' : 
218+             elif  first_char ==  116 :   #   't'
204219                #  true
205220                if  end -  value_pos >=  4  and  memcmp(value_pos, b" true"  , 4 ) ==  0 :
206221                    value_len[0 ] =  4 
207222                    return  value_pos
208223                return  NULL 
209224
210-             elif  first_char ==  b ' f' : 
225+             elif  first_char ==  102 :   #   'f'
211226                #  false
212227                if  end -  value_pos >=  5  and  memcmp(value_pos, b" false"  , 5 ) ==  0 :
213228                    value_len[0 ] =  5 
@@ -219,7 +234,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
219234                quote_end =  value_pos +  1 
220235                while  quote_end <  end:
221236                    #  Check for delimiter characters
222-                     if  quote_end[0 ] ==  b '   '   or  quote_end[0 ] ==  b ' , '   or  quote_end[0 ] ==  b ' } '   or  quote_end[0 ] ==  b ' ] '   or  quote_end[0 ] ==  b ' \t '   or  quote_end[0 ] ==  b ' \n ' : 
237+                     if  quote_end[0 ] ==  32   or  quote_end[0 ] ==  44  or  quote_end[0 ] ==  125  or  quote_end[0 ] ==  93  or  quote_end[0 ] ==  9  or  quote_end[0 ] ==  10 :   #  ' ', ',', '}', ']', '\t', '\n' 
223238                        value_len[0 ] =  quote_end -  value_pos
224239                        return  value_pos
225240                    quote_end +=  1 
@@ -259,7 +274,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
259274    cdef Py_ssize_t key_len
260275    cdef str  col_type
261276    cdef dict  result =  {}
262-     cdef Py_ssize_t num_lines  =   0 
277+     cdef Py_ssize_t num_lines
263278    cdef Py_ssize_t i
264279    cdef Py_ssize_t num_cols =  len (column_names)
265280    cdef list  column_lists =  []
@@ -270,10 +285,12 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
270285    cdef int  type_code
271286    cdef list  col_list
272287    cdef bytes value_bytes
273-     cdef str  value_str
274288    cdef object  parsed
275289    cdef Py_ssize_t remaining
276290    cdef const char *  newline_pos
291+     cdef size_t line_count
292+     cdef size_t estimated_lines
293+     cdef Py_ssize_t line_index =  0 
277294
278295    result =  {}
279296
@@ -303,12 +320,24 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
303320            col_type =  column_types.get(col, ' str'  )
304321            type_codes[i] =  _column_type_code(col_type)
305322
323+         #  Pre-count lines to preallocate arrays (using architecture-appropriate SIMD function)
324+         line_count =  simd_count(data, data_len, 10 )  #  Count '\n' (ASCII 10)
325+         if  data_len >  0  and  data[data_len -  1 ] !=  10 :  #  Doesn't end with '\n'
326+             estimated_lines =  line_count +  1 
327+         else :
328+             estimated_lines =  line_count
329+ 
330+         #  Preallocate column lists
331+         for  i in  range (num_cols):
332+             col_list =  column_lists[i]
333+             column_lists[i] =  [None ] *  estimated_lines
334+ 
306335        while  pos <  end:
307336            line_start =  pos
308337            remaining =  end -  line_start
309338            if  remaining <=  0 :
310339                break 
311-             newline_pos =  < const char * > memchr(line_start, b ' \n '  , < size_t> remaining)
340+             newline_pos =  < const char * > memchr(line_start, 10 , < size_t> remaining)   #  '\n' 
312341            if  newline_pos ==  NULL :
313342                line_end =  end
314343                pos =  end
@@ -317,11 +346,11 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
317346                pos =  newline_pos +  1 
318347
319348            line_len =  line_end -  line_start
320-             num_lines +=  1 
349+             #   num_lines += 1  # Removed, using line_index instead 
321350
322351            if  line_len ==  0 :
323-                 for  i  in   range (num_cols): 
324-                     ( < list > column_lists[i]).append( None ) 
352+                 #  Already pre-filled with None 
353+                 line_index  +=   1 
325354                continue 
326355
327356            for  i in  range (num_cols):
@@ -333,58 +362,65 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
333362                value_ptr =  find_key_value(line_start, line_len, key_ptr, key_len, & value_start, & value_len)
334363
335364                if  value_ptr ==  NULL :
336-                     col_list.append( None ) 
365+                     #  Already  None
337366                    continue 
338367
339368                if  type_code ==  COL_BOOL:
340369                    if  value_len ==  4  and  memcmp(value_ptr, b" true"  , 4 ) ==  0 :
341-                         col_list.append( True ) 
370+                         col_list[line_index]  =   True 
342371                    elif  value_len ==  5  and  memcmp(value_ptr, b" false"  , 5 ) ==  0 :
343-                         col_list.append(False )
344-                     else :
345-                         col_list.append(None )
372+                         col_list[line_index] =  False 
373+                     #  else already None
346374
347375                elif  type_code ==  COL_INT:
348376                    if  value_len ==  4  and  memcmp(value_ptr, b" null"  , 4 ) ==  0 :
349-                         col_list.append(None )
377+                         #  Already None
378+                         pass 
350379                    else :
351-                         col_list.append( fast_atoll(value_ptr, value_len) )
380+                         col_list[line_index]  =   fast_atoll(value_ptr, value_len)
352381
353382                elif  type_code ==  COL_FLOAT:
354383                    if  value_len ==  4  and  memcmp(value_ptr, b" null"  , 4 ) ==  0 :
355-                         col_list.append(None )
384+                         #  Already None
385+                         pass 
356386                    else :
357387                        value_bytes =  PyBytes_FromStringAndSize(value_ptr, value_len)
358-                         col_list.append( c_parse_fast_float(value_bytes) )
388+                         col_list[line_index]  =   c_parse_fast_float(value_bytes)
359389
360390                elif  type_code ==  COL_STR:
361391                    if  value_len ==  4  and  memcmp(value_ptr, b" null"  , 4 ) ==  0 :
362-                         col_list.append(None )
363-                     elif  value_ptr[0 ] ==  b' "'   and  value_len >=  2 :
364-                         value_bytes =  PyBytes_FromStringAndSize(value_ptr +  1 , value_len -  2 )
365-                         try :
366-                             value_str =  value_bytes.decode(' utf-8'  )
367-                             value_str =  value_str.replace(' \\ n'  , ' \n '  ).replace(' \\ t'  , ' \t '  ).replace(' \\ "'  , ' "'  ).replace(' \\\\ '  , ' \\ '  )
368-                             col_list.append(value_str)
369-                         except  UnicodeDecodeError :
370-                             col_list.append(None )
392+                         #  Already None
393+                         pass 
371394                    else :
372-                         col_list.append(None )
395+                         value_bytes =  PyBytes_FromStringAndSize(value_ptr, value_len)
396+                         try :
397+                             parsed =  json.loads(value_bytes)
398+                             if  isinstance (parsed, str ):
399+                                 col_list[line_index] =  parsed
400+                             #  else already None
401+                         except  (json.JSONDecodeError, UnicodeDecodeError ):
402+                             #  Already None
403+                             pass 
373404
374405                else :
375406                    if  value_len ==  4  and  memcmp(value_ptr, b" null"  , 4 ) ==  0 :
376-                         col_list.append(None )
407+                         #  Already None
408+                         pass 
377409                    else :
378410                        value_bytes =  PyBytes_FromStringAndSize(value_ptr, value_len)
379411                        try :
380412                            parsed =  json.loads(value_bytes)
381413                            if  isinstance (parsed, dict ):
382-                                 col_list.append( json.dumps(parsed) )
414+                                 col_list[line_index]  =   json.dumps(parsed)
383415                            else :
384-                                 col_list.append( parsed) 
416+                                 col_list[line_index]  =   parsed
385417                        except  (json.JSONDecodeError, UnicodeDecodeError ):
386-                             col_list.append(None )
418+                             #  Already None
419+                             pass 
420+ 
421+             line_index +=  1 
387422
423+         num_lines =  line_index
388424        return  (num_lines, num_cols, result)
389425    finally :
390426        if  type_codes !=  NULL :
0 commit comments