1010Fast JSONL decoder using Cython for performance-critical operations.
1111"""
1212
13- from libc.string cimport memchr, strlen, strstr, memcmp
14- from libc.stdlib cimport strtod, strtol, atoi
13+ from libc.string cimport memchr, memcmp
1514from libc.stddef cimport size_t
1615from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
17- from libc.stdint cimport int64_t
1816from cpython.mem cimport PyMem_Malloc, PyMem_Free
1917
20- import pyarrow
2118import json
2219
2320
@@ -48,7 +45,7 @@ cdef inline int _column_type_code(str col_type):
4845cdef inline const char * find_key_value(const char * line, Py_ssize_t line_len, const char * key, Py_ssize_t key_len, Py_ssize_t* value_start, Py_ssize_t* value_len):
4946 """
5047 Find the value for a given key in a JSON line.
51-
48+
5249 Returns pointer to value start, or NULL if not found.
5350 Updates value_start and value_len with the position and length.
5451 """
@@ -63,7 +60,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
6360 cdef int bracket_count
6461 cdef int backslash_run
6562 cdef Py_ssize_t remaining
66-
63+
6764 # Search for the key pattern: "key":
6865 while pos < end:
6966 # Find opening quote of a key
@@ -73,27 +70,25 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
7370 key_pos = < const char * > memchr(pos, b' "' , < size_t> remaining)
7471 if key_pos == NULL :
7572 return NULL
76-
73+
7774 key_pos += 1 # Move past the opening quote
78-
75+
7976 # Check if this matches our key
80- if (end - key_pos >= key_len and
81- memcmp(key_pos, key, < size_t> key_len) == 0 and
82- key_pos[key_len] == b' "' ):
83-
77+ if (end - key_pos >= key_len and memcmp(key_pos, key, < size_t> key_len) == 0 and key_pos[key_len] == b' "' ):
78+
8479 # Found the key, now find the colon
8580 value_pos = key_pos + key_len + 1 # Skip closing quote
86-
81+
8782 # Skip whitespace and colon
8883 while value_pos < end and (value_pos[0 ] == b' ' or value_pos[0 ] == b' \t ' or value_pos[0 ] == b' :' ):
8984 value_pos += 1
90-
85+
9186 if value_pos >= end:
9287 return NULL
93-
88+
9489 first_char = value_pos[0 ]
9590 value_start[0 ] = value_pos - line
96-
91+
9792 # Determine value type and find end
9893 if first_char == b' "' :
9994 # String value - find closing quote, handling escapes
@@ -111,7 +106,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
111106 backslash_run = 0
112107 quote_end += 1
113108 return NULL
114-
109+
115110 elif first_char == b' {' :
116111 # Object - count braces
117112 brace_count = 1
@@ -134,7 +129,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
134129 quote_end += 1
135130 value_len[0 ] = quote_end - value_pos
136131 return value_pos
137-
132+
138133 elif first_char == b' [' :
139134 # Array - count brackets
140135 bracket_count = 1
@@ -157,28 +152,28 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
157152 quote_end += 1
158153 value_len[0 ] = quote_end - value_pos
159154 return value_pos
160-
155+
161156 elif first_char == b' n' :
162157 # null
163158 if end - value_pos >= 4 and memcmp(value_pos, b" null" , 4 ) == 0 :
164159 value_len[0 ] = 4
165160 return value_pos
166161 return NULL
167-
162+
168163 elif first_char == b' t' :
169164 # true
170165 if end - value_pos >= 4 and memcmp(value_pos, b" true" , 4 ) == 0 :
171166 value_len[0 ] = 4
172167 return value_pos
173168 return NULL
174-
169+
175170 elif first_char == b' f' :
176171 # false
177172 if end - value_pos >= 5 and memcmp(value_pos, b" false" , 5 ) == 0 :
178173 value_len[0 ] = 5
179174 return value_pos
180175 return NULL
181-
176+
182177 else :
183178 # Number - find end (space, comma, brace, bracket)
184179 quote_end = value_pos + 1
@@ -190,22 +185,22 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
190185 quote_end += 1
191186 value_len[0 ] = quote_end - value_pos
192187 return value_pos
193-
188+
194189 pos = key_pos
195-
190+
196191 return NULL
197192
198193
199194cpdef fast_jsonl_decode_columnar(bytes buffer , list column_names, dict column_types, Py_ssize_t sample_size = 100 ):
200195 """
201196 Fast JSONL decoder that extracts values using C string operations.
202-
197+
203198 Parameters:
204199 buffer: bytes - The JSONL data
205200 column_names: list - List of column names to extract
206201 column_types: dict - Dictionary mapping column names to types ('bool', 'int', 'float', 'str', etc.)
207202 sample_size: int - Number of lines to use for schema inference (not used if column_types provided)
208-
203+
209204 Returns:
210205 tuple: (num_rows, num_cols, dict of column_name -> list of values)
211206 """
@@ -239,9 +234,9 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
239234 cdef object parsed
240235 cdef Py_ssize_t remaining
241236 cdef const char * newline_pos
242-
237+
243238 result = {}
244-
239+
245240 if num_cols > 0 :
246241 type_codes = < int * > PyMem_Malloc(num_cols * sizeof(int ))
247242 key_ptrs = < const char ** > PyMem_Malloc(num_cols * sizeof(const char * ))
@@ -254,7 +249,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
254249 if key_lengths != NULL :
255250 PyMem_Free(key_lengths)
256251 raise MemoryError ()
257-
252+
258253 try :
259254 for i in range (num_cols):
260255 col = column_names[i]
@@ -267,7 +262,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
267262 result[col] = col_list
268263 col_type = column_types.get(col, ' str' )
269264 type_codes[i] = _column_type_code(col_type)
270-
265+
271266 while pos < end:
272267 line_start = pos
273268 remaining = end - line_start
@@ -280,35 +275,35 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
280275 else :
281276 line_end = newline_pos
282277 pos = newline_pos + 1
283-
278+
284279 line_len = line_end - line_start
285280 num_lines += 1
286-
281+
287282 if line_len == 0 :
288283 for i in range (num_cols):
289284 (< list > column_lists[i]).append(None )
290285 continue
291-
286+
292287 for i in range (num_cols):
293288 col_list = < list > column_lists[i]
294289 key_ptr = key_ptrs[i]
295290 key_len = key_lengths[i]
296291 type_code = type_codes[i]
297-
292+
298293 value_ptr = find_key_value(line_start, line_len, key_ptr, key_len, & value_start, & value_len)
299-
294+
300295 if value_ptr == NULL :
301296 col_list.append(None )
302297 continue
303-
298+
304299 if type_code == COL_BOOL:
305300 if value_len == 4 and memcmp(value_ptr, b" true" , 4 ) == 0 :
306301 col_list.append(True )
307302 elif value_len == 5 and memcmp(value_ptr, b" false" , 5 ) == 0 :
308303 col_list.append(False )
309304 else :
310305 col_list.append(None )
311-
306+
312307 elif type_code == COL_INT:
313308 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
314309 col_list.append(None )
@@ -318,7 +313,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
318313 col_list.append(int (value_bytes))
319314 except ValueError :
320315 col_list.append(None )
321-
316+
322317 elif type_code == COL_FLOAT:
323318 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
324319 col_list.append(None )
@@ -328,7 +323,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
328323 col_list.append(float (value_bytes))
329324 except ValueError :
330325 col_list.append(None )
331-
326+
332327 elif type_code == COL_STR:
333328 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
334329 col_list.append(None )
@@ -342,7 +337,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
342337 col_list.append(None )
343338 else :
344339 col_list.append(None )
345-
340+
346341 else :
347342 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
348343 col_list.append(None )
@@ -356,7 +351,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
356351 col_list.append(parsed)
357352 except (json.JSONDecodeError, UnicodeDecodeError ):
358353 col_list.append(None )
359-
354+
360355 return (num_lines, num_cols, result)
361356 finally :
362357 if type_codes != NULL :
0 commit comments