Merge pull request #2870 from mabel-dev/clickbench-performance-regression-investigation-1

joocer · web-flow · commit e512bbc7753a · 2025-10-26T08:30:24.000Z
jsonl decoder reads draken vectors (converted to pyarrow via to_arrow)
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1695
+__build__ = 1698
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1695"
+__version__ = "0.26.0-beta.1698"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
diff --git a/opteryx/connectors/disk_connector.py b/opteryx/connectors/disk_connector.py
@@ -138,40 +138,14 @@ def read_blob(
             OSError:
                 If an I/O error occurs while reading the file.
         """
-        file_descriptor = None
-        _map = None
         try:
             file_descriptor = os.open(blob_name, os.O_RDONLY | os.O_BINARY)
-            # on platforms that support it give the kernel a hint about access pattern
             if hasattr(os, "posix_fadvise"):
-                # sequential access is the common pattern for dataset reads
-                try:
-                    os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_SEQUENTIAL)
-                except OSError:
-                    # fallback to WILLNEED if SEQUENTIAL is not allowed
-                    with contextlib.suppress(Exception):
-                        os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_WILLNEED)
-
+                os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_WILLNEED)
             size = os.fstat(file_descriptor).st_size
             _map = mmap.mmap(file_descriptor, length=size, **mmap_config)
-
-            # On Linux advise the kernel that access will be sequential to improve readahead
-            if IS_LINUX:
-                # if anything goes wrong, ignore
-                with contextlib.suppress(Exception):
-                    libc = ctypes.CDLL("libc.so.6")
-                    # MADV_SEQUENTIAL is 2 on Linux, but don't hardcode if available
-                    MADV_SEQUENTIAL = 2
-                    addr = ctypes.c_void_p(ctypes.addressof(ctypes.c_char.from_buffer(_map)))
-                    length = ctypes.c_size_t(size)
-                    libc.madvise(addr, length, MADV_SEQUENTIAL)
-
-            # pass a memoryview of the mmap to decoders - this makes intent explicit
-            # and lets decoders that can accept memoryviews avoid extra copies
-            buffer = memoryview(_map)
-
             result = decoder(
-                buffer,
+                _map,
                 just_schema=just_schema,
                 projection=projection,
                 selection=selection,
@@ -181,20 +155,14 @@ def read_blob(
 
             if not just_schema:
                 stats = self.read_blob_statistics(
-                    blob_name=blob_name, blob_bytes=buffer, decoder=decoder
+                    blob_name=blob_name, blob_bytes=_map, decoder=decoder
                 )
                 if self.relation_statistics is None:
                     self.relation_statistics = stats
 
             return result
         finally:
-            # Ensure mmap is closed before closing the file descriptor
-            with contextlib.suppress(Exception):
-                if _map is not None:
-                    _map.close()
-            with contextlib.suppress(Exception):
-                if file_descriptor is not None:
-                    os.close(file_descriptor)
+            os.close(file_descriptor)
 
     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
diff --git a/opteryx/utils/file_decoders.py b/opteryx/utils/file_decoders.py
@@ -509,30 +509,14 @@ def jsonl_decoder(
 
     for idx, name in enumerate(table["column_names"]):
         field = arrow_schema.field(name)
-        col = table["columns"][idx]
-        try:
-            # First attempt to build the array using the declared type.
-            arr = pyarrow.array(col, type=field.type)
-        except Exception:
-            # If that fails, infer the best array type from the data.
-            print(
-                f"Warning: could not convert column '{name}' to type {field.type}, inferring type."
-            )
-            print(f"Data sample: {col[:5]}")
-            print(set(type(t) for t in col))
-            arr = pyarrow.array(col)
-
-        arrays.append(arr)
-
-        # If inference produced a different type (e.g. list<...> instead of
-        # binary) use that type in the final schema so Table construction
-        # doesn't try to coerce incompatible arrays and raise errors like
-        # "Expected bytes, got list". We deliberately do not coerce lists
-        # into strings here — leave string columns alone as requested.
-        if arr.type != field.type:
-            final_fields.append(pyarrow.field(field.name, arr.type))
+        column = table["columns"][idx]
+        if hasattr(column, "to_arrow"):
+            # rugo returns draken vectors; convert to pyarrow arrays
+            arrays.append(column.to_arrow())
         else:
-            final_fields.append(field)
+            # fallback: convert using pyarrow array constructor
+            arrays.append(pyarrow.array(column, type=field.type))
+        final_fields.append(field)
 
     final_schema = pyarrow.schema(final_fields)
     arrow_table = pyarrow.Table.from_arrays(arrays, schema=final_schema)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,12 +1,12 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1695"
+version = "0.26.0-beta.1698"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
 authors = [{name='Justin Joyce', email='justin.joyce@joocer.com'}]
 maintainers = [{name='Justin Joyce', email='justin.joyce@joocer.com'}]
-dependencies = ['aiohttp', 'numpy>=2.0.0', 'orjson', 'orso>=0.0.204', 'psutil', 'pyarrow>=20.0.0', 'requests', 'rugo>=0.1.19']
+dependencies = ['aiohttp', 'draken', 'numpy>=2.0.0', 'orjson', 'orso>=0.0.204', 'psutil', 'pyarrow>=20.0.0', 'requests', 'rugo>=0.1.19']
 
 [project.scripts]
 opteryx = "opteryx.command:main"
diff --git a/tests/integration/sql_battery/test_shapes_edge_cases.py b/tests/integration/sql_battery/test_shapes_edge_cases.py
@@ -611,6 +611,8 @@ def test_sql_battery(statement:str, rows:int, columns:int, exception: Optional[E
             print(f"\033[0;31m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms ❌ {failed}\033[0m")
             print(">", err)
             failures.append((statement, err))
+            if not isinstance(err, AssertionError):
+                raise err
 
     print("--- ✅ \033[0;32mdone\033[0m")
 
diff --git a/tests/performance/benchmarks/bench_jsonl.py b/tests/performance/benchmarks/bench_jsonl.py