Skip to content

Commit 3833757

Browse files
committed
improve performance
1 parent 275cbb1 commit 3833757

File tree

12 files changed

+111
-56
lines changed

12 files changed

+111
-56
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ help: ## Show this help message
5151
lint: ## Run all linting tools
5252
$(call print_blue,"Installing linting tools...")
5353
@$(PIP) install --quiet --upgrade pycln isort ruff yamllint cython-lint
54+
$(call print_blue,"Removing whitespace in pyx files...")
55+
@$(PYTHON) dev/fix_cython_whitespace.py
5456
$(call print_blue,"Running Cython lint...")
5557
@cython-lint $(SRC_DIR)/compiled/**/*.pyx || true
5658
$(call print_blue,"Running Ruff checks...")

dev/fix_cython_whitespace.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Fix whitespace issues in Cython files.
4+
Removes trailing whitespace from every line (W291/W293 errors) and ensures the file ends with a single newline.
5+
"""
6+
from pathlib import Path
7+
8+
9+
def fix_whitespace_in_file(filepath: Path) -> bool:
    """
    Strip trailing whitespace from every line of a single file, in place.

    The file is read as UTF-8, trailing whitespace is removed from each
    line, trailing blank lines are dropped, and non-empty content is
    terminated by exactly one newline. The file is only rewritten when the
    cleaned text differs from what was read.

    Parameters:
        filepath: Path - The file to clean up.

    Returns:
        bool: True if file was modified, False otherwise (including when the
            file could not be read, decoded or written; the error is printed)
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            original_content = f.read()

        # Split on '\n' only. str.splitlines() also breaks on form feeds,
        # U+2028, U+0085 and similar characters; rstrip() would then delete
        # that character without restoring a line ending, silently fusing
        # the text on either side of it (e.g. inside a string literal).
        # Text-mode reading has already translated '\r\n' and '\r' to '\n'.
        fixed_content = '\n'.join(
            line.rstrip() for line in original_content.split('\n')
        )

        # Drop trailing blank lines and end with exactly one newline.
        # Content that is empty after stripping stays empty rather than
        # gaining a lone newline.
        fixed_content = fixed_content.rstrip()
        if fixed_content:
            fixed_content += '\n'

        if fixed_content == original_content:
            return False

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(fixed_content)
        return True
    except (OSError, UnicodeDecodeError) as e:
        # Best-effort tool: report the problem and carry on with other files.
        print(f"Error processing {filepath}: {e}")
        return False
37+
38+
39+
def main():
    """
    Fix whitespace in all Cython files of the project's ``opteryx`` package.

    The project root is taken to be the parent of the directory containing
    this script. Every ``.pyx`` file below ``<root>/opteryx`` is passed to
    ``fix_whitespace_in_file`` and a summary of modified files is printed.
    """
    root = Path(__file__).parent.parent
    print(f"Scanning for .pyx files in {root}")
    # Use glob(), not rglob(): rglob() prepends '**/' to the pattern, which
    # would walk the whole repository and match any nested 'opteryx'
    # directory (virtualenv site-packages, build output, ...), rewriting
    # files outside the source tree.
    pyx_files = sorted(root.glob('opteryx/**/*.pyx'))

    if not pyx_files:
        print("No .pyx files found")
        return

    print(f"Found {len(pyx_files)} .pyx files")
    modified_count = 0

    for filepath in pyx_files:
        if fix_whitespace_in_file(filepath):
            print(f"Fixed: {filepath.relative_to(root)}")
            modified_count += 1

    print(f"\nFixed {modified_count} file(s)")
58+
59+
60+
if __name__ == '__main__':
61+
main()

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1656
4+
__build__ = 1657
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1656"
6+
__version__ = "0.26.0-beta.1657"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/structures/jsonl_decoder.pyx

Lines changed: 36 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,11 @@
1010
Fast JSONL decoder using Cython for performance-critical operations.
1111
"""
1212

13-
from libc.string cimport memchr, strlen, strstr, memcmp
14-
from libc.stdlib cimport strtod, strtol, atoi
13+
from libc.string cimport memchr, memcmp
1514
from libc.stddef cimport size_t
1615
from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
17-
from libc.stdint cimport int64_t
1816
from cpython.mem cimport PyMem_Malloc, PyMem_Free
1917

20-
import pyarrow
2118
import json
2219

2320

@@ -48,7 +45,7 @@ cdef inline int _column_type_code(str col_type):
4845
cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, const char* key, Py_ssize_t key_len, Py_ssize_t* value_start, Py_ssize_t* value_len):
4946
"""
5047
Find the value for a given key in a JSON line.
51-
48+
5249
Returns pointer to value start, or NULL if not found.
5350
Updates value_start and value_len with the position and length.
5451
"""
@@ -63,7 +60,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
6360
cdef int bracket_count
6461
cdef int backslash_run
6562
cdef Py_ssize_t remaining
66-
63+
6764
# Search for the key pattern: "key":
6865
while pos < end:
6966
# Find opening quote of a key
@@ -73,27 +70,25 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
7370
key_pos = <const char*>memchr(pos, b'"', <size_t>remaining)
7471
if key_pos == NULL:
7572
return NULL
76-
73+
7774
key_pos += 1 # Move past the opening quote
78-
75+
7976
# Check if this matches our key
80-
if (end - key_pos >= key_len and
81-
memcmp(key_pos, key, <size_t>key_len) == 0 and
82-
key_pos[key_len] == b'"'):
83-
77+
if (end - key_pos >= key_len and memcmp(key_pos, key, <size_t>key_len) == 0 and key_pos[key_len] == b'"'):
78+
8479
# Found the key, now find the colon
8580
value_pos = key_pos + key_len + 1 # Skip closing quote
86-
81+
8782
# Skip whitespace and colon
8883
while value_pos < end and (value_pos[0] == b' ' or value_pos[0] == b'\t' or value_pos[0] == b':'):
8984
value_pos += 1
90-
85+
9186
if value_pos >= end:
9287
return NULL
93-
88+
9489
first_char = value_pos[0]
9590
value_start[0] = value_pos - line
96-
91+
9792
# Determine value type and find end
9893
if first_char == b'"':
9994
# String value - find closing quote, handling escapes
@@ -111,7 +106,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
111106
backslash_run = 0
112107
quote_end += 1
113108
return NULL
114-
109+
115110
elif first_char == b'{':
116111
# Object - count braces
117112
brace_count = 1
@@ -134,7 +129,7 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
134129
quote_end += 1
135130
value_len[0] = quote_end - value_pos
136131
return value_pos
137-
132+
138133
elif first_char == b'[':
139134
# Array - count brackets
140135
bracket_count = 1
@@ -157,28 +152,28 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
157152
quote_end += 1
158153
value_len[0] = quote_end - value_pos
159154
return value_pos
160-
155+
161156
elif first_char == b'n':
162157
# null
163158
if end - value_pos >= 4 and memcmp(value_pos, b"null", 4) == 0:
164159
value_len[0] = 4
165160
return value_pos
166161
return NULL
167-
162+
168163
elif first_char == b't':
169164
# true
170165
if end - value_pos >= 4 and memcmp(value_pos, b"true", 4) == 0:
171166
value_len[0] = 4
172167
return value_pos
173168
return NULL
174-
169+
175170
elif first_char == b'f':
176171
# false
177172
if end - value_pos >= 5 and memcmp(value_pos, b"false", 5) == 0:
178173
value_len[0] = 5
179174
return value_pos
180175
return NULL
181-
176+
182177
else:
183178
# Number - find end (space, comma, brace, bracket)
184179
quote_end = value_pos + 1
@@ -190,22 +185,22 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
190185
quote_end += 1
191186
value_len[0] = quote_end - value_pos
192187
return value_pos
193-
188+
194189
pos = key_pos
195-
190+
196191
return NULL
197192

198193

199194
cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_types, Py_ssize_t sample_size=100):
200195
"""
201196
Fast JSONL decoder that extracts values using C string operations.
202-
197+
203198
Parameters:
204199
buffer: bytes - The JSONL data
205200
column_names: list - List of column names to extract
206201
column_types: dict - Dictionary mapping column names to types ('bool', 'int', 'float', 'str', etc.)
207202
sample_size: int - Number of lines to use for schema inference (not used if column_types provided)
208-
203+
209204
Returns:
210205
tuple: (num_rows, num_cols, dict of column_name -> list of values)
211206
"""
@@ -239,9 +234,9 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
239234
cdef object parsed
240235
cdef Py_ssize_t remaining
241236
cdef const char* newline_pos
242-
237+
243238
result = {}
244-
239+
245240
if num_cols > 0:
246241
type_codes = <int*>PyMem_Malloc(num_cols * sizeof(int))
247242
key_ptrs = <const char**>PyMem_Malloc(num_cols * sizeof(const char*))
@@ -254,7 +249,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
254249
if key_lengths != NULL:
255250
PyMem_Free(key_lengths)
256251
raise MemoryError()
257-
252+
258253
try:
259254
for i in range(num_cols):
260255
col = column_names[i]
@@ -267,7 +262,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
267262
result[col] = col_list
268263
col_type = column_types.get(col, 'str')
269264
type_codes[i] = _column_type_code(col_type)
270-
265+
271266
while pos < end:
272267
line_start = pos
273268
remaining = end - line_start
@@ -280,35 +275,35 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
280275
else:
281276
line_end = newline_pos
282277
pos = newline_pos + 1
283-
278+
284279
line_len = line_end - line_start
285280
num_lines += 1
286-
281+
287282
if line_len == 0:
288283
for i in range(num_cols):
289284
(<list>column_lists[i]).append(None)
290285
continue
291-
286+
292287
for i in range(num_cols):
293288
col_list = <list>column_lists[i]
294289
key_ptr = key_ptrs[i]
295290
key_len = key_lengths[i]
296291
type_code = type_codes[i]
297-
292+
298293
value_ptr = find_key_value(line_start, line_len, key_ptr, key_len, &value_start, &value_len)
299-
294+
300295
if value_ptr == NULL:
301296
col_list.append(None)
302297
continue
303-
298+
304299
if type_code == COL_BOOL:
305300
if value_len == 4 and memcmp(value_ptr, b"true", 4) == 0:
306301
col_list.append(True)
307302
elif value_len == 5 and memcmp(value_ptr, b"false", 5) == 0:
308303
col_list.append(False)
309304
else:
310305
col_list.append(None)
311-
306+
312307
elif type_code == COL_INT:
313308
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
314309
col_list.append(None)
@@ -318,7 +313,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
318313
col_list.append(int(value_bytes))
319314
except ValueError:
320315
col_list.append(None)
321-
316+
322317
elif type_code == COL_FLOAT:
323318
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
324319
col_list.append(None)
@@ -328,7 +323,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
328323
col_list.append(float(value_bytes))
329324
except ValueError:
330325
col_list.append(None)
331-
326+
332327
elif type_code == COL_STR:
333328
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
334329
col_list.append(None)
@@ -342,7 +337,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
342337
col_list.append(None)
343338
else:
344339
col_list.append(None)
345-
340+
346341
else:
347342
if value_len == 4 and memcmp(value_ptr, b"null", 4) == 0:
348343
col_list.append(None)
@@ -356,7 +351,7 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
356351
col_list.append(parsed)
357352
except (json.JSONDecodeError, UnicodeDecodeError):
358353
col_list.append(None)
359-
354+
360355
return (num_lines, num_cols, result)
361356
finally:
362357
if type_codes != NULL:

opteryx/third_party/fuzzy/soundex.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ cpdef soundex(char* s):
5656
else:
5757
# Get the soundex code for this character
5858
code = soundex_map[c - 65]
59-
59+
6060
if code != 48: # Not a vowel/ignored letter
6161
# Only add if not the same as previous soundex code
6262
if code != prev_code:

opteryx/third_party/ulfjack/ryu.pyx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,15 @@ cdef inline int trim_trailing_zeros(char* buf, int length) nogil:
3535

3636
cdef inline bint is_safe_double(double d) nogil:
3737
"""Check if a double value is safe to pass to ryu"""
38-
return (isfinite(d) and
39-
d <= MAX_SAFE_DOUBLE and
38+
return (isfinite(d) and
39+
d <= MAX_SAFE_DOUBLE and
4040
d >= MIN_SAFE_DOUBLE)
4141

4242
cdef inline bytes safe_double_to_bytes(double d, uint32_t precision):
4343
"""Safely convert a double to bytes, handling extreme values"""
4444
cdef char buf[32]
4545
cdef int length
46-
46+
4747
if not is_safe_double(d):
4848
if isnan(d):
4949
return b"NaN"
@@ -55,7 +55,7 @@ cdef inline bytes safe_double_to_bytes(double d, uint32_t precision):
5555
else:
5656
# For extreme finite values, fall back to Python string conversion
5757
return str(d).encode('ascii')
58-
58+
5959
length = d2fixed_buffered_n(d, precision, buf)
6060
length = trim_trailing_zeros(buf, length)
6161
return <bytes>buf[:length]

0 commit comments

Comments
 (0)