 given as a folder on local disk
 """
 
-import contextlib
 import mmap
 import os
-import platform
 import time
 from typing import Dict
 from typing import List
@@ -34,17 +32,6 @@
 from opteryx.utils.file_decoders import get_decoder
 
 OS_SEP = os.sep
-IS_LINUX = platform.system() == "Linux"
-
-
-# prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in
-flags = mmap.MAP_PRIVATE
-if IS_LINUX:
-    with contextlib.suppress(Exception):
-        flags |= getattr(mmap, "MAP_POPULATE", 0)
-mmap_config = {}
-mmap_config["flags"] = flags
-mmap_config["prot"] = mmap.PROT_READ
 
 
 class DiskConnector(BaseConnector, Partitionable, PredicatePushable, LimitPushable, Statistics):
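The module-level block removed in this hunk built the `mmap` flags once at import: a private mapping, with `MAP_POPULATE` OR-ed in on Linux so pages are faulted in eagerly. For context, a minimal POSIX-only sketch of that pattern; the `read_mapped` helper is illustrative and not part of the connector:

```python
import contextlib
import mmap
import os
import platform

# Prefer a private, read-only mapping; on Linux, MAP_POPULATE pre-faults pages
# so the first scan of the file avoids per-page fault overhead.
flags = mmap.MAP_PRIVATE
if platform.system() == "Linux":
    with contextlib.suppress(Exception):
        flags |= getattr(mmap, "MAP_POPULATE", 0)


def read_mapped(path: str) -> bytes:
    """Illustrative helper: map a file read-only and copy its contents out."""
    fd = os.open(path, os.O_RDONLY)
    try:
        size = os.fstat(fd).st_size
        if size == 0:
            return b""  # mmap cannot map an empty file
        with mmap.mmap(fd, size, flags=flags, prot=mmap.PROT_READ) as mapped:
            return bytes(mapped)
    finally:
        os.close(fd)
```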
@@ -157,7 +144,8 @@ def read_blob(
             return result
         finally:
             # CRITICAL: Clean up the memory mapping
-            unmap_memory(mmap_obj)
+            pass
+            # unmap_memory(mmap_obj)
 
     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
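The hunk above stubs out the explicit cleanup in `read_blob`'s `finally` block, leaving `unmap_memory(mmap_obj)` commented out. For context, a sketch of the try/finally unmapping pattern the original line implemented; the body shown here for `unmap_memory` is an assumption for illustration only, as the connector imports its own helper:

```python
import mmap


def unmap_memory(mapped) -> None:
    """Assumed behaviour: close the mapping if it is still open; safe to call twice."""
    if mapped is not None and not mapped.closed:
        mapped.close()


def read_all(path: str) -> bytes:
    """Illustrative read of a whole file via mmap that always releases the mapping."""
    with open(path, "rb") as handle:
        mapped = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            return bytes(mapped)
        finally:
            # CRITICAL: clean up the memory mapping (the pattern read_blob used)
            unmap_memory(mapped)
```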
@@ -219,7 +207,7 @@ def read_dataset(
             decoder = get_decoder(blob_name)
             try:
                 if not just_schema:
-                    num_rows, _, raw_bytes, decoded = self.read_blob(
+                    num_rows, _, raw_size, decoded = self.read_blob(
                         blob_name=blob_name,
                         decoder=decoder,
                         just_schema=False,
@@ -234,8 +222,8 @@ def read_dataset(
 
                     self.statistics.rows_seen += num_rows
                     self.rows_seen += num_rows
-                    self.statistics.bytes_raw += raw_bytes
                     self.blobs_seen += 1
+                    self.statistics.bytes_raw += raw_size
                     yield decoded
 
                     # if we have read all the rows we need to stop
@@ -247,14 +235,9 @@ def read_dataset(
                         decoder=decoder,
                         just_schema=True,
                     )
-                    # Some decoders may return None for schema (e.g. unreadable
-                    # or undecidable schema). Skip those and continue with the
-                    # next blob instead of trying to access attributes on None.
-                    if schema is None:
-                        continue
                     # if we have more than one blob we need to estimate the row count
                     blob_count = len(blob_names)
-                    if getattr(schema, "row_count_metric", None) and blob_count > 1:
+                    if schema.row_count_metric and blob_count > 1:
                         schema.row_count_estimate = schema.row_count_metric * blob_count
                         schema.row_count_metric = None
                         self.statistics.estimated_row_count += schema.row_count_estimate
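When only a schema is requested and the dataset has more than one blob, the branch above scales the per-blob row count into a dataset-wide estimate. A minimal sketch of that arithmetic, using a stand-in `Schema` dataclass (the real schema object is not shown in this diff and carries more fields):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Schema:
    # stand-in for the real schema type; only the two fields used above
    row_count_metric: Optional[int] = None
    row_count_estimate: Optional[int] = None


def estimate_rows(schema: Schema, blob_count: int) -> Schema:
    """Scale the rows counted in one blob to an estimate for the whole dataset."""
    if schema.row_count_metric and blob_count > 1:
        schema.row_count_estimate = schema.row_count_metric * blob_count
        schema.row_count_metric = None  # the exact per-blob count no longer applies
    return schema


# e.g. one sampled blob reports 10,000 rows and the dataset holds 8 blobs
print(estimate_rows(Schema(row_count_metric=10_000), 8).row_count_estimate)  # 80000
```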