Merge pull request #2879 from mabel-dev/clickbench-performance-regression-investigation-1

joocer · web-flow · commit 2ea26aa2292b · 2025-10-29T20:41:44.000Z
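Linux disk read performance: stop pre-faulting mmap pages (drop MAP_POPULATE) and disable the explicit unmap in DiskConnector.read_blob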
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1707
+__build__ = 1710
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1707"
+__version__ = "0.26.0-beta.1710"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
diff --git a/opteryx/connectors/disk_connector.py b/opteryx/connectors/disk_connector.py
@@ -8,10 +8,7 @@
 given as a folder on local disk
 """
 
-import contextlib
-import mmap
 import os
-import platform
 import time
 from typing import Dict
 from typing import List
@@ -34,17 +31,6 @@
 from opteryx.utils.file_decoders import get_decoder
 
 OS_SEP = os.sep
-IS_LINUX = platform.system() == "Linux"
-
-
-# prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in
-flags = mmap.MAP_PRIVATE
-if IS_LINUX:
-    with contextlib.suppress(Exception):
-        flags |= getattr(mmap, "MAP_POPULATE", 0)
-mmap_config = {}
-mmap_config["flags"] = flags
-mmap_config["prot"] = mmap.PROT_READ
 
 
 class DiskConnector(BaseConnector, Partitionable, PredicatePushable, LimitPushable, Statistics):
@@ -128,7 +114,7 @@ def read_blob(
                 If an I/O error occurs while reading the file.
         """
         from opteryx.compiled.io.disk_reader import read_file_mmap
-        from opteryx.compiled.io.disk_reader import unmap_memory
+        # Disabled with the explicit unmap below: from opteryx.compiled.io.disk_reader import unmap_memory
 
         # Read using mmap for maximum speed
         mmap_obj = read_file_mmap(blob_name)
@@ -157,7 +143,8 @@ def read_blob(
             return result
         finally:
             # CRITICAL: Clean up the memory mapping
-            unmap_memory(mmap_obj)
+            pass
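+            # TODO(review): unmap_memory(mmap_obj) is disabled, so this finally block no longer releases the mapping; confirm it is freed elsewhere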
 
     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
@@ -219,7 +206,7 @@ def read_dataset(
             decoder = get_decoder(blob_name)
             try:
                 if not just_schema:
-                    num_rows, _, raw_bytes, decoded = self.read_blob(
+                    num_rows, _, raw_size, decoded = self.read_blob(
                         blob_name=blob_name,
                         decoder=decoder,
                         just_schema=False,
@@ -234,8 +221,8 @@ def read_dataset(
 
                     self.statistics.rows_seen += num_rows
                     self.rows_seen += num_rows
-                    self.statistics.bytes_raw += raw_bytes
                     self.blobs_seen += 1
+                    self.statistics.bytes_raw += raw_size
                     yield decoded
 
                     # if we have read all the rows we need to stop
@@ -247,14 +234,9 @@ def read_dataset(
                         decoder=decoder,
                         just_schema=True,
                     )
-                    # Some decoders may return None for schema (e.g. unreadable
-                    # or undecidable schema). Skip those and continue with the
-                    # next blob instead of trying to access attributes on None.
-                    if schema is None:
-                        continue
                     # if we have more than one blob we need to estimate the row count
                     blob_count = len(blob_names)
-                    if getattr(schema, "row_count_metric", None) and blob_count > 1:
+                    if schema.row_count_metric and blob_count > 1:
                         schema.row_count_estimate = schema.row_count_metric * blob_count
                         schema.row_count_metric = None
                         self.statistics.estimated_row_count += schema.row_count_estimate
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1707"
+version = "0.26.0-beta.1710"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
diff --git a/src/cpp/disk_io.cpp b/src/cpp/disk_io.cpp
@@ -176,7 +176,7 @@ int read_all_mmap(const char* path, uint8_t** dst, size_t* out_len) {
         return 0;
     }
     
-    void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
+    void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
     close(fd);
 
     if (mapped == MAP_FAILED) {

Original file line number	Diff line number	Diff line change
`@@ -176,7 +176,7 @@ int read_all_mmap(const char* path, uint8_t** dst, size_t* out_len) {`
`176`	`176`	`return 0;`
`177`	`177`	`}`
`178`	`178`
`179`		`- void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE \| MAP_POPULATE, fd, 0);`
	`179`	`+ void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);`
`180`	`180`	`close(fd);`
`181`	`181`
`182`	`182`	`if (mapped == MAP_FAILED) {`