Skip to content

Commit e512bbc

Browse files
authored
Merge pull request #2870 from mabel-dev/clickbench-performance-regression-investigation-1
jsonl reads draken
2 parents ea5e13e + adbe3df commit e512bbc

File tree

6 files changed

+17
-381
lines changed

6 files changed

+17
-381
lines changed

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1695
4+
__build__ = 1698
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1695"
6+
__version__ = "0.26.0-beta.1698"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/disk_connector.py

Lines changed: 4 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -138,40 +138,14 @@ def read_blob(
138138
OSError:
139139
If an I/O error occurs while reading the file.
140140
"""
141-
file_descriptor = None
142-
_map = None
143141
try:
144142
file_descriptor = os.open(blob_name, os.O_RDONLY | os.O_BINARY)
145-
# on platforms that support it give the kernel a hint about access pattern
146143
if hasattr(os, "posix_fadvise"):
147-
# sequential access is the common pattern for dataset reads
148-
try:
149-
os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_SEQUENTIAL)
150-
except OSError:
151-
# fallback to WILLNEED if SEQUENTIAL is not allowed
152-
with contextlib.suppress(Exception):
153-
os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_WILLNEED)
154-
144+
os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_WILLNEED)
155145
size = os.fstat(file_descriptor).st_size
156146
_map = mmap.mmap(file_descriptor, length=size, **mmap_config)
157-
158-
# On Linux advise the kernel that access will be sequential to improve readahead
159-
if IS_LINUX:
160-
# if anything goes wrong, ignore
161-
with contextlib.suppress(Exception):
162-
libc = ctypes.CDLL("libc.so.6")
163-
# MADV_SEQUENTIAL is 2 on Linux, but don't hardcode if available
164-
MADV_SEQUENTIAL = 2
165-
addr = ctypes.c_void_p(ctypes.addressof(ctypes.c_char.from_buffer(_map)))
166-
length = ctypes.c_size_t(size)
167-
libc.madvise(addr, length, MADV_SEQUENTIAL)
168-
169-
# pass a memoryview of the mmap to decoders - this makes intent explicit
170-
# and lets decoders that can accept memoryviews avoid extra copies
171-
buffer = memoryview(_map)
172-
173147
result = decoder(
174-
buffer,
148+
_map,
175149
just_schema=just_schema,
176150
projection=projection,
177151
selection=selection,
@@ -181,20 +155,14 @@ def read_blob(
181155

182156
if not just_schema:
183157
stats = self.read_blob_statistics(
184-
blob_name=blob_name, blob_bytes=buffer, decoder=decoder
158+
blob_name=blob_name, blob_bytes=_map, decoder=decoder
185159
)
186160
if self.relation_statistics is None:
187161
self.relation_statistics = stats
188162

189163
return result
190164
finally:
191-
# Ensure mmap is closed before closing the file descriptor
192-
with contextlib.suppress(Exception):
193-
if _map is not None:
194-
_map.close()
195-
with contextlib.suppress(Exception):
196-
if file_descriptor is not None:
197-
os.close(file_descriptor)
165+
os.close(file_descriptor)
198166

199167
@single_item_cache
200168
def get_list_of_blob_names(self, *, prefix: str) -> List[str]:

opteryx/utils/file_decoders.py

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -509,30 +509,14 @@ def jsonl_decoder(
509509

510510
for idx, name in enumerate(table["column_names"]):
511511
field = arrow_schema.field(name)
512-
col = table["columns"][idx]
513-
try:
514-
# First attempt to build the array using the declared type.
515-
arr = pyarrow.array(col, type=field.type)
516-
except Exception:
517-
# If that fails, infer the best array type from the data.
518-
print(
519-
f"Warning: could not convert column '{name}' to type {field.type}, inferring type."
520-
)
521-
print(f"Data sample: {col[:5]}")
522-
print(set(type(t) for t in col))
523-
arr = pyarrow.array(col)
524-
525-
arrays.append(arr)
526-
527-
# If inference produced a different type (e.g. list<...> instead of
528-
# binary) use that type in the final schema so Table construction
529-
# doesn't try to coerce incompatible arrays and raise errors like
530-
# "Expected bytes, got list". We deliberately do not coerce lists
531-
# into strings here — leave string columns alone as requested.
532-
if arr.type != field.type:
533-
final_fields.append(pyarrow.field(field.name, arr.type))
512+
column = table["columns"][idx]
513+
if hasattr(column, "to_arrow"):
514+
# rugo returns draken vectors; convert to pyarrow arrays
515+
arrays.append(column.to_arrow())
534516
else:
535-
final_fields.append(field)
517+
# fallback: convert using pyarrow array constructor
518+
arrays.append(pyarrow.array(column, type=field.type))
519+
final_fields.append(field)
536520

537521
final_schema = pyarrow.schema(final_fields)
538522
arrow_table = pyarrow.Table.from_arrays(arrays, schema=final_schema)

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.0-beta.1695"
3+
version = "0.26.0-beta.1698"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}
77
authors = [{name='Justin Joyce', email='[email protected]'}]
88
maintainers = [{name='Justin Joyce', email='[email protected]'}]
9-
dependencies = ['aiohttp', 'numpy>=2.0.0', 'orjson', 'orso>=0.0.204', 'psutil', 'pyarrow>=20.0.0', 'requests', 'rugo>=0.1.19']
9+
dependencies = ['aiohttp', 'draken', 'numpy>=2.0.0', 'orjson', 'orso>=0.0.204', 'psutil', 'pyarrow>=20.0.0', 'requests', 'rugo>=0.1.19']
1010

1111
[project.scripts]
1212
opteryx = "opteryx.command:main"

tests/integration/sql_battery/test_shapes_edge_cases.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,8 @@ def test_sql_battery(statement:str, rows:int, columns:int, exception: Optional[E
611611
print(f"\033[0;31m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms ❌ {failed}\033[0m")
612612
print(">", err)
613613
failures.append((statement, err))
614+
if not isinstance(err, AssertionError):
615+
raise err
614616

615617
print("--- ✅ \033[0;32mdone\033[0m")
616618

0 commit comments

Comments
 (0)