
Commit 82ce111

Merge pull request #2871 from mabel-dev/clickbench-performance-regression-investigation-1
Clickbench performance regression investigation 1
2 parents e512bbc + a2bca50 commit 82ce111

5 files changed (+142 -123 lines)


opteryx/__version__.py

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY

-__build__ = 1698
+__build__ = 1703
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1698"
+__version__ = "0.26.0-beta.1703"

 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/disk_connector.py

Lines changed: 71 additions & 40 deletions

@@ -9,7 +9,6 @@
 """

 import contextlib
-import ctypes
 import mmap
 import os
 import platform

@@ -31,31 +30,21 @@
 from opteryx.exceptions import DatasetNotFoundError
 from opteryx.exceptions import EmptyDatasetError
 from opteryx.exceptions import UnsupportedFileTypeError
-from opteryx.utils import is_windows
 from opteryx.utils.file_decoders import TUPLE_OF_VALID_EXTENSIONS
 from opteryx.utils.file_decoders import get_decoder

 OS_SEP = os.sep
-IS_WINDOWS = is_windows()
 IS_LINUX = platform.system() == "Linux"

-# Define os.O_BINARY for non-Windows platforms if it's not already defined
-if not hasattr(os, "O_BINARY"):
-    os.O_BINARY = 0  # Value has no effect on non-Windows platforms
-if not hasattr(os, "O_DIRECT"):
-    os.O_DIRECT = 0  # Value has no effect on non-Windows platforms

+# prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in
+flags = mmap.MAP_PRIVATE
+if IS_LINUX:
+    with contextlib.suppress(Exception):
+        flags |= getattr(mmap, "MAP_POPULATE", 0)
 mmap_config = {}
-if not IS_WINDOWS:
-    # prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in
-    flags = mmap.MAP_PRIVATE
-    if IS_LINUX and hasattr(mmap, "MAP_POPULATE"):
-        with contextlib.suppress(Exception):
-            flags |= mmap.MAP_POPULATE
-    mmap_config["flags"] = flags
-    mmap_config["prot"] = mmap.PROT_READ
-else:
-    mmap_config["access"] = mmap.ACCESS_READ
+mmap_config["flags"] = flags
+mmap_config["prot"] = mmap.PROT_READ


 class DiskConnector(BaseConnector, Partitionable, PredicatePushable, LimitPushable, Statistics):

@@ -138,31 +127,73 @@ def read_blob(
         OSError:
             If an I/O error occurs while reading the file.
         """
-        try:
-            file_descriptor = os.open(blob_name, os.O_RDONLY | os.O_BINARY)
-            if hasattr(os, "posix_fadvise"):
-                os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_WILLNEED)
-            size = os.fstat(file_descriptor).st_size
-            _map = mmap.mmap(file_descriptor, length=size, **mmap_config)
-            result = decoder(
-                _map,
-                just_schema=just_schema,
-                projection=projection,
-                selection=selection,
-                use_threads=True,
-            )
-            self.statistics.bytes_read += size
+        # Hybrid strategy: choose mmap or read+memoryview depending on OS
+        # macOS -> mmap, Linux -> read.
+
+        # helper to use mmap path
+        def _use_mmap():
+            fd = os.open(blob_name, os.O_RDONLY)
+            try:
+                if hasattr(os, "posix_fadvise"):
+                    with contextlib.suppress(Exception):
+                        os.posix_fadvise(fd, 0, 0, os.POSIX_FADV_WILLNEED)
+                size = os.fstat(fd).st_size
+                _map = mmap.mmap(fd, length=size, **mmap_config)
+                result = decoder(
+                    _map,
+                    just_schema=just_schema,
+                    projection=projection,
+                    selection=selection,
+                    use_threads=True,
+                )
+
+                self.statistics.bytes_read += size

-            if not just_schema:
-                stats = self.read_blob_statistics(
-                    blob_name=blob_name, blob_bytes=_map, decoder=decoder
+                if not just_schema:
+                    stats = self.read_blob_statistics(
+                        blob_name=blob_name, blob_bytes=_map, decoder=decoder
+                    )
+                    if self.relation_statistics is None:
+                        self.relation_statistics = stats
+
+                return result
+            finally:
+                os.close(fd)
+
+        # helper to use read()+memoryview path
+        def _use_read():
+            with open(blob_name, "rb") as f:
+                if hasattr(os, "posix_fadvise"):
+                    with contextlib.suppress(Exception):
+                        os.posix_fadvise(f.fileno(), 0, 0, os.POSIX_FADV_WILLNEED)
+
+                data = f.read()
+                size = len(data)
+                buf = memoryview(data)
+
+                result = decoder(
+                    buf,
+                    just_schema=just_schema,
+                    projection=projection,
+                    selection=selection,
+                    use_threads=True,
                 )
-                if self.relation_statistics is None:
-                    self.relation_statistics = stats

-            return result
-        finally:
-            os.close(file_descriptor)
+                self.statistics.bytes_read += size
+
+                if not just_schema:
+                    stats = self.read_blob_statistics(
+                        blob_name=blob_name, blob_bytes=buf, decoder=decoder
+                    )
+                    if self.relation_statistics is None:
+                        self.relation_statistics = stats
+
+                return result
+
+        # macOS: use mmap; Linux: prefer read (observed faster on some Linux setups)
+        if platform.system() == "Darwin":
+            return _use_mmap()
+        return _use_read()

     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
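The substance of this file's change: `read_blob` now dispatches by platform between a memory-mapped read and a plain `read()` into a `memoryview`, and the module-level `mmap_config` drops its Windows branch (the `O_BINARY` shim and `ACCESS_READ`) in favour of Unix-only `flags`/`prot`. Below is a minimal standalone sketch of that hybrid strategy, assuming a Unix platform; the connector's decoder call and statistics bookkeeping are omitted, and `read_blob_bytes` is a hypothetical helper name, not part of the commit.

import contextlib
import mmap
import os
import platform

# mirror the module-level config from the diff: a private mapping,
# with MAP_POPULATE on Linux so pages are pre-faulted into memory
_FLAGS = mmap.MAP_PRIVATE
if platform.system() == "Linux":
    _FLAGS |= getattr(mmap, "MAP_POPULATE", 0)


def read_blob_bytes(path: str):
    """Return a buffer over the whole file: mmap on macOS, read() elsewhere."""
    if platform.system() == "Darwin":
        fd = os.open(path, os.O_RDONLY)
        try:
            # hint the kernel that the whole file will be needed soon
            if hasattr(os, "posix_fadvise"):
                with contextlib.suppress(Exception):
                    os.posix_fadvise(fd, 0, 0, os.POSIX_FADV_WILLNEED)
            size = os.fstat(fd).st_size
            # the mapping remains valid after the descriptor is closed
            return mmap.mmap(fd, length=size, flags=_FLAGS, prot=mmap.PROT_READ)
        finally:
            os.close(fd)
    # Linux and everything else: a single read() wrapped in a memoryview,
    # which the in-diff comment notes was observed faster on some Linux setups
    with open(path, "rb") as f:
        return memoryview(f.read())

Note that `mmap.MAP_PRIVATE` and `mmap.PROT_READ` are Unix-only constants, so after this change the module appears to assume a non-Windows platform, consistent with the removal of the `is_windows` import.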

opteryx/utils/file_decoders.py

Lines changed: 28 additions & 36 deletions

@@ -243,7 +243,7 @@ def parquet_decoder(
     just_schema: bool = False,
     just_statistics: bool = False,
     force_read: bool = False,
-    use_threads: bool = False,
+    use_threads: bool = True,
     statistics: Optional[RelationStatistics] = None,
 ) -> Tuple[int, int, pyarrow.Table]:
     """

@@ -334,21 +334,28 @@ def parquet_decoder(

         return statistics

-    # If we're here, we can't use rugo - we need to read the file with pyarrow
-
-    # Open the parquet file only once. Prefer pyarrow.BufferReader with a
-    # pyarrow.Buffer when we have a memoryview to avoid creating intermediate
-    # Python bytes objects.
+    # Use rugo's lightweight metadata reader first (faster than pyarrow)
     if isinstance(buffer, memoryview):
-        pa_buf = pyarrow.py_buffer(buffer)
-        stream = pyarrow.BufferReader(pa_buf)
-    elif isinstance(buffer, bytes):
-        stream = pyarrow.BufferReader(buffer)
+        rmeta = parquet_meta.read_metadata_from_memoryview(buffer)
     else:
-        stream = pyarrow.input_stream(buffer)
+        rmeta = parquet_meta.read_metadata_from_memoryview(memoryview(buffer))

-    pq_meta = parquet.read_metadata(stream)
-    stream.seek(0)
+    # Build the pieces we need from the rugo metadata
+    # schema names (parquet has same columns across row groups usually)
+    if rmeta.get("row_groups"):
+        schema_names = [c["name"] for c in rmeta["row_groups"][0]["columns"]]
+    else:
+        schema_names = []
+
+    num_rows = rmeta.get("num_rows")
+    # number of columns - try to derive, fallback to length of schema_names
+    num_columns = rmeta.get("num_columns") or len(schema_names)
+
+    # total uncompressed size (rugo uses total_byte_size)
+    uncompressed_size = sum(
+        sum(col.get("total_byte_size", 0) for col in rg.get("columns", []))
+        for rg in rmeta.get("row_groups", [])
+    )

     # we need to work out if we have a selection which may force us
     # fetching columns just for filtering

@@ -361,36 +368,21 @@ def parquet_decoder(
         filter_columns = {
             c.value for c in get_all_nodes_of_type(processed_selection, (NodeType.IDENTIFIER,))
         }
-    selected_columns = list(projection_set.union(filter_columns).intersection(pq_meta.schema.names))
+    selected_columns = list(projection_set.union(filter_columns).intersection(schema_names))

     # Read all columns if none are selected, unless force_read is set
     if not selected_columns and not force_read:
         selected_columns = []

-    # get the full data size of the file to see how effective projection/selection is
-    uncompressed_size = sum(
-        row_group.column(j).total_uncompressed_size
-        for i in range(pq_meta.num_row_groups)
-        for row_group in [pq_meta.row_group(i)]
-        for j in range(row_group.num_columns)
-    )
-
-    # If it's COUNT(*), we don't need to create a full dataset
-    # We have a handler later to sum up the $COUNT(*) column
-    if projection == [] and selection == []:
-        table = pyarrow.Table.from_arrays([[pq_meta.num_rows]], names=["$COUNT(*)"])
-        return (
-            pq_meta.num_rows,
-            pq_meta.num_columns,
-            uncompressed_size,
-            table,
-        )
+    # Open the parquet file only once. Fake a file-like object around the buffer
+    if isinstance(buffer, memoryview):
+        buffer = MemoryViewStream(buffer)

     # Read the parquet table with the optimized column list and selection filters
     table = parquet.read_table(
-        stream,
+        buffer,
         columns=selected_columns,
-        pre_buffer=False,
+        pre_buffer=True,
         filters=dnf_filter,
         use_threads=use_threads,
         use_pandas_metadata=False,

@@ -401,8 +393,8 @@
         table = filter_records(processed_selection, table)

     return (
-        pq_meta.num_rows,
-        pq_meta.num_columns,
+        num_rows,
+        num_columns,
         uncompressed_size,
         table,
     )
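The decoder now sources its metadata from rugo's dict-based reader (`parquet_meta.read_metadata_from_memoryview`) rather than `pyarrow.parquet.read_metadata`, and derives the schema names, row/column counts, and uncompressed size from that dict. Below is a small illustrative helper over the same dict shape, inferred only from the fields the diff reads; `summarise_meta` and the sample dict are hypothetical, not part of the library.

from typing import Any, Dict, List, Tuple


def summarise_meta(rmeta: Dict[str, Any]) -> Tuple[List[str], int, int, int]:
    """Derive (schema_names, num_rows, num_columns, uncompressed_size)
    essentially the way the new decoder path does."""
    row_groups = rmeta.get("row_groups", [])
    # column names come from the first row group; parquet files normally
    # carry identical columns in every row group
    schema_names = [c["name"] for c in row_groups[0]["columns"]] if row_groups else []
    num_rows = rmeta.get("num_rows") or 0
    num_columns = rmeta.get("num_columns") or len(schema_names)
    # rugo reports per-column uncompressed sizes as total_byte_size
    uncompressed_size = sum(
        sum(col.get("total_byte_size", 0) for col in rg.get("columns", []))
        for rg in row_groups
    )
    return schema_names, num_rows, num_columns, uncompressed_size


# e.g. a one-row-group, two-column file
meta = {
    "num_rows": 100,
    "row_groups": [
        {
            "columns": [
                {"name": "a", "total_byte_size": 800},
                {"name": "b", "total_byte_size": 1600},
            ]
        }
    ],
}
assert summarise_meta(meta) == (["a", "b"], 100, 2, 2400)

The table read itself still goes through `pyarrow.parquet.read_table`, now fed the `MemoryViewStream` wrapper and `pre_buffer=True`, which coalesces column-chunk reads and is mainly aimed at high-latency sources.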

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1698"
+version = "0.26.0-beta.1703"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
