Skip to content

Commit f40c9f7

Browse files
committed
improve jsonl performance
1 parent be49a57 commit f40c9f7

File tree

6 files changed

+244
-56
lines changed

6 files changed

+244
-56
lines changed

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1660
4+
__build__ = 1661
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1660"
6+
__version__ = "0.26.0-beta.1661"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/structures/jsonl_decoder.pyx

Lines changed: 88 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,22 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
1818
from opteryx.third_party.fastfloat.fast_float cimport c_parse_fast_float
1919

2020
import orjson as json
21+
import platform
22+
23+
cdef extern from "simd_search.h":
24+
size_t neon_count(const char* data, size_t length, char target)
25+
size_t avx_count(const char* data, size_t length, char target)
26+
27+
28+
# Detect architecture at module initialization and select the appropriate SIMD function
29+
cdef size_t (*simd_count)(const char*, size_t, char)
30+
31+
# Detect the CPU architecture once at module load: arm64/aarch64 use neon_count, anything else falls back to avx_count
32+
_arch = platform.machine().lower()
33+
if _arch in ('arm64', 'aarch64'):
34+
simd_count = neon_count
35+
else:
36+
simd_count = avx_count
2137

2238

2339
cdef enum ColumnType:
@@ -107,20 +123,19 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
107123
remaining = end - pos
108124
if remaining <= 0:
109125
return NULL
110-
key_pos = <const char*>memchr(pos, b'"', <size_t>remaining)
126+
key_pos = <const char*>memchr(pos, 34, <size_t>remaining) # '"'
111127
if key_pos == NULL:
112128
return NULL
113129

114130
key_pos += 1 # Move past the opening quote
115131

116132
# Check if this matches our key
117-
if (end - key_pos >= key_len and memcmp(key_pos, key, <size_t>key_len) == 0 and key_pos[key_len] == b'"'):
118-
133+
if (end - key_pos >= key_len and memcmp(key_pos, key, <size_t>key_len) == 0 and key_pos[key_len] == 34): # '"'
119134
# Found the key, now find the colon
120135
value_pos = key_pos + key_len + 1 # Skip closing quote
121136

122137
# Skip whitespace and colon
123-
while value_pos < end and (value_pos[0] == b' ' or value_pos[0] == b'\t' or value_pos[0] == b':'):
138+
while value_pos < end and (value_pos[0] == 32 or value_pos[0] == 9 or value_pos[0] == 58): # ' ', '\t', ':'
124139
value_pos += 1
125140

126141
if value_pos >= end:
@@ -130,84 +145,84 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
130145
value_start[0] = value_pos - line
131146

132147
# Determine value type and find end
133-
if first_char == b'"':
148+
if first_char == 34: # '"'
134149
# String value - find closing quote, handling escapes
135150
quote_start = value_pos + 1
136151
quote_end = quote_start
137152
backslash_run = 0
138153
while quote_end < end:
139-
if quote_end[0] == b'\\':
154+
if quote_end[0] == 92: # '\\'
140155
backslash_run += 1
141156
else:
142-
if quote_end[0] == b'"' and (backslash_run & 1) == 0:
157+
if quote_end[0] == 34 and (backslash_run & 1) == 0: # '"'
143158
# Found unescaped quote
144159
value_len[0] = (quote_end + 1) - value_pos
145160
return value_pos
146161
backslash_run = 0
147162
quote_end += 1
148163
return NULL
149164

150-
elif first_char == b'{':
165+
elif first_char == 123: # '{'
151166
# Object - count braces
152167
brace_count = 1
153168
quote_end = value_pos + 1
154169
while quote_end < end and brace_count > 0:
155-
if quote_end[0] == b'{':
170+
if quote_end[0] == 123: # '{'
156171
brace_count += 1
157-
elif quote_end[0] == b'}':
172+
elif quote_end[0] == 125: # '}'
158173
brace_count -= 1
159-
elif quote_end[0] == b'"':
174+
elif quote_end[0] == 34: # '"'
160175
# Skip string contents to avoid premature brace counting
161176
quote_end += 1
162177
while quote_end < end:
163-
if quote_end[0] == b'\\':
178+
if quote_end[0] == 92: # '\'
164179
quote_end += 2
165180
continue
166-
if quote_end[0] == b'"':
181+
if quote_end[0] == 34: # '"'
167182
break
168183
quote_end += 1
169184
quote_end += 1
170185
value_len[0] = quote_end - value_pos
171186
return value_pos
172187

173-
elif first_char == b'[':
188+
elif first_char == 91: # '['
174189
# Array - count brackets
175190
bracket_count = 1
176191
quote_end = value_pos + 1
177192
while quote_end < end and bracket_count > 0:
178-
if quote_end[0] == b'[':
193+
if quote_end[0] == 91: # '['
179194
bracket_count += 1
180-
elif quote_end[0] == b']':
195+
elif quote_end[0] == 93: # ']'
181196
bracket_count -= 1
182-
elif quote_end[0] == b'"':
197+
elif quote_end[0] == 34: # '"'
183198
# Skip string contents inside arrays
184199
quote_end += 1
185200
while quote_end < end:
186-
if quote_end[0] == b'\\':
201+
if quote_end[0] == 92:
187202
quote_end += 2
188203
continue
189-
if quote_end[0] == b'"':
204+
if quote_end[0] == 34: # '"'
190205
break
191206
quote_end += 1
192207
quote_end += 1
193208
value_len[0] = quote_end - value_pos
194209
return value_pos
195210

196-
elif first_char == b'n':
211+
elif first_char == 110: # 'n'
197212
# null
198213
if end - value_pos >= 4 and memcmp(value_pos, b"null", 4) == 0:
199214
value_len[0] = 4
200215
return value_pos
201216
return NULL
202217

203-
elif first_char == b't':
218+
elif first_char == 116: # 't'
204219
# true
205220
if end - value_pos >= 4 and memcmp(value_pos, b"true", 4) == 0:
206221
value_len[0] = 4
207222
return value_pos
208223
return NULL
209224

210-
elif first_char == b'f':
225+
elif first_char == 102: # 'f'
211226
# false
212227
if end - value_pos >= 5 and memcmp(value_pos, b"false", 5) == 0:
213228
value_len[0] = 5
@@ -219,7 +234,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
219234
quote_end = value_pos + 1
220235
while quote_end < end:
221236
# Check for delimiter characters
222-
if quote_end[0] == b' ' or quote_end[0] == b',' or quote_end[0] == b'}' or quote_end[0] == b']' or quote_end[0] == b'\t' or quote_end[0] == b'\n':
237+
if quote_end[0] == 32 or quote_end[0] == 44 or quote_end[0] == 125 or quote_end[0] == 93 or quote_end[0] == 9 or quote_end[0] == 10: # ' ', ',', '}', ']', '\t', '\n'
223238
value_len[0] = quote_end - value_pos
224239
return value_pos
225240
quote_end += 1
@@ -259,7 +274,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
259274
cdef Py_ssize_t key_len
260275
cdef str col_type
261276
cdef dict result = {}
262-
cdef Py_ssize_t num_lines = 0
277+
cdef Py_ssize_t num_lines
263278
cdef Py_ssize_t i
264279
cdef Py_ssize_t num_cols = len(column_names)
265280
cdef list column_lists = []
@@ -270,10 +285,12 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
270285
cdef int type_code
271286
cdef list col_list
272287
cdef bytes value_bytes
273-
cdef str value_str
274288
cdef object parsed
275289
cdef Py_ssize_t remaining
276290
cdef const char* newline_pos
291+
cdef size_t line_count
292+
cdef size_t estimated_lines
293+
cdef Py_ssize_t line_index = 0
277294

278295
result = {}
279296

@@ -303,12 +320,24 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
303320
col_type = column_types.get(col, 'str')
304321
type_codes[i] = _column_type_code(col_type)
305322

323+
# Pre-count lines to preallocate arrays (using architecture-appropriate SIMD function)
324+
line_count = simd_count(data, data_len, 10) # Count '\n' (ASCII 10)
325+
if data_len > 0 and data[data_len - 1] != 10: # Doesn't end with '\n'
326+
estimated_lines = line_count + 1
327+
else:
328+
estimated_lines = line_count
329+
330+
# Preallocate column lists
331+
for i in range(num_cols):
332+
col_list = column_lists[i]
333+
column_lists[i] = [None] * estimated_lines
334+
306335
while pos < end:
307336
line_start = pos
308337
remaining = end - line_start
309338
if remaining <= 0:
310339
break
311-
newline_pos = <const char*>memchr(line_start, b'\n', <size_t>remaining)
340+
newline_pos = <const char*>memchr(line_start, 10, <size_t>remaining) # '\n'
312341
if newline_pos == NULL:
313342
line_end = end
314343
pos = end
@@ -317,11 +346,11 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
317346
pos = newline_pos + 1
318347

319348
line_len = line_end - line_start
320-
num_lines += 1
349+
# Lines are counted via line_index; num_lines is assigned from it after the loop
321350

322351
if line_len == 0:
323-
for i in range(num_cols):
324-
(<list>column_lists[i]).append(None)
352+
# Already pre-filled with None
353+
line_index += 1
325354
continue
326355

327356
for i in range(num_cols):
@@ -333,58 +362,65 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
333362
value_ptr = find_key_value(line_start, line_len, key_ptr, key_len, &value_start, &value_len)
334363

335364
if value_ptr == NULL:
336-
col_list.append(None)
365+
# Already None
337366
continue
338367

339368
if type_code == COL_BOOL:
340369
if value_len == 4 and memcmp(value_ptr, b"true", 4) == 0:
341-
col_list.append(True)
370+
col_list[line_index] = True
342371
elif value_len == 5 and memcmp(value_ptr, b"false", 5) == 0:
343-
col_list.append(False)
344-
else:
345-
col_list.append(None)
372+
col_list[line_index] = False
373+
# else already None
346374

347375
elif type_code == COL_INT:
348376
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
349-
col_list.append(None)
377+
# Already None
378+
pass
350379
else:
351-
col_list.append(fast_atoll(value_ptr, value_len))
380+
col_list[line_index] = fast_atoll(value_ptr, value_len)
352381

353382
elif type_code == COL_FLOAT:
354383
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
355-
col_list.append(None)
384+
# Already None
385+
pass
356386
else:
357387
value_bytes = PyBytes_FromStringAndSize(value_ptr, value_len)
358-
col_list.append(c_parse_fast_float(value_bytes))
388+
col_list[line_index] = c_parse_fast_float(value_bytes)
359389

360390
elif type_code == COL_STR:
361391
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
362-
col_list.append(None)
363-
elif value_ptr[0] == b'"' and value_len >= 2:
364-
value_bytes = PyBytes_FromStringAndSize(value_ptr + 1, value_len - 2)
365-
try:
366-
value_str = value_bytes.decode('utf-8')
367-
value_str = value_str.replace('\\n', '\n').replace('\\t', '\t').replace('\\"', '"').replace('\\\\', '\\')
368-
col_list.append(value_str)
369-
except UnicodeDecodeError:
370-
col_list.append(None)
392+
# Already None
393+
pass
371394
else:
372-
col_list.append(None)
395+
value_bytes = PyBytes_FromStringAndSize(value_ptr, value_len)
396+
try:
397+
parsed = json.loads(value_bytes)
398+
if isinstance(parsed, str):
399+
col_list[line_index] = parsed
400+
# else already None
401+
except (json.JSONDecodeError, UnicodeDecodeError):
402+
# Already None
403+
pass
373404

374405
else:
375406
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
376-
col_list.append(None)
407+
# Already None
408+
pass
377409
else:
378410
value_bytes = PyBytes_FromStringAndSize(value_ptr, value_len)
379411
try:
380412
parsed = json.loads(value_bytes)
381413
if isinstance(parsed, dict):
382-
col_list.append(json.dumps(parsed))
414+
col_list[line_index] = json.dumps(parsed)
383415
else:
384-
col_list.append(parsed)
416+
col_list[line_index] = parsed
385417
except (json.JSONDecodeError, UnicodeDecodeError):
386-
col_list.append(None)
418+
# Already None
419+
pass
420+
421+
line_index += 1
387422

423+
num_lines = line_index
388424
return (num_lines, num_cols, result)
389425
finally:
390426
if type_codes != NULL:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.0-beta.1660"
3+
version = "0.26.0-beta.1661"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}

setup.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,10 @@ def rust_build(setup_kwargs: Dict[str, Any]) -> None:
251251
),
252252
Extension(
253253
name="opteryx.compiled.structures.jsonl_decoder",
254-
sources=["opteryx/compiled/structures/jsonl_decoder.pyx"],
254+
sources=[
255+
"opteryx/compiled/structures/jsonl_decoder.pyx",
256+
"src/cpp/simd_search.cpp"
257+
],
255258
include_dirs=include_dirs + ["third_party/fastfloat/fast_float"],
256259
language="c++",
257260
extra_compile_args=CPP_COMPILE_FLAGS,

0 commit comments

Comments
 (0)