
Commit 6bf8d23

linux disk performance

1 parent 6f24deb commit 6bf8d23

File tree

3 files changed: +8 -25 lines

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY

-__build__ = 1707
+__build__ = 1708
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1707"
+__version__ = "0.26.0-beta.1708"

 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/disk_connector.py

Lines changed: 5 additions & 22 deletions

@@ -8,10 +8,8 @@
 given as a folder on local disk
 """

-import contextlib
 import mmap
 import os
-import platform
 import time
 from typing import Dict
 from typing import List
@@ -34,17 +32,6 @@
 from opteryx.utils.file_decoders import get_decoder

 OS_SEP = os.sep
-IS_LINUX = platform.system() == "Linux"
-
-
-# prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in
-flags = mmap.MAP_PRIVATE
-if IS_LINUX:
-    with contextlib.suppress(Exception):
-        flags |= getattr(mmap, "MAP_POPULATE", 0)
-mmap_config = {}
-mmap_config["flags"] = flags
-mmap_config["prot"] = mmap.PROT_READ


 class DiskConnector(BaseConnector, Partitionable, PredicatePushable, LimitPushable, Statistics):
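For context on what this hunk removes: the connector used to build its mmap keyword arguments once at import time, preferring a private copy-on-write mapping and, on Linux, OR-ing in MAP_POPULATE so the kernel faults pages in at map time rather than on first access. A minimal standalone sketch of that technique follows, assuming a Unix platform (mmap.PROT_READ does not exist on Windows); map_file_readonly is a hypothetical helper for illustration, not part of the codebase:

import mmap
import os
import platform

# MAP_PRIVATE gives a process-private, copy-on-write mapping; on Linux,
# MAP_POPULATE asks the kernel to pre-fault every page, trading a slower
# mmap() call for fewer page faults while reading.
FLAGS = mmap.MAP_PRIVATE
if platform.system() == "Linux":
    # getattr guards interpreters where the constant is missing (added in 3.10)
    FLAGS |= getattr(mmap, "MAP_POPULATE", 0)

def map_file_readonly(path: str) -> mmap.mmap:
    """Memory-map a file read-only with the flags computed above."""
    fd = os.open(path, os.O_RDONLY)
    try:
        return mmap.mmap(fd, os.fstat(fd).st_size, flags=FLAGS, prot=mmap.PROT_READ)
    finally:
        os.close(fd)  # safe: the mapping holds its own reference to the file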
@@ -157,7 +144,8 @@ def read_blob(
             return result
         finally:
             # CRITICAL: Clean up the memory mapping
-            unmap_memory(mmap_obj)
+            pass
+            # unmap_memory(mmap_obj)

     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
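This hunk replaces the eager unmap with a no-op, leaving the mapping to be released when the mmap object is garbage-collected. A plausible motivation, which is an assumption rather than anything stated in the commit, is that decoded results can hold zero-copy views into the mapped pages, which an eager munmap would invalidate. A sketch of the two lifecycles, with illustrative function names:

import mmap

def read_eager(m: mmap.mmap) -> bytes:
    """Copy the data out, then unmap immediately."""
    try:
        return m[:]  # slicing copies bytes out of the mapping
    finally:
        m.close()    # munmap() happens here; any later access raises ValueError

def read_deferred(m: mmap.mmap) -> memoryview:
    """Hand out a zero-copy view; the pages stay mapped until both the
    view and the mmap object are garbage-collected."""
    return memoryview(m)  # m.close() would raise BufferError while this lives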
@@ -219,7 +207,7 @@ def read_dataset(
             decoder = get_decoder(blob_name)
             try:
                 if not just_schema:
-                    num_rows, _, raw_bytes, decoded = self.read_blob(
+                    num_rows, _, raw_size, decoded = self.read_blob(
                         blob_name=blob_name,
                         decoder=decoder,
                         just_schema=False,
@@ -234,8 +222,8 @@ def read_dataset(

                     self.statistics.rows_seen += num_rows
                     self.rows_seen += num_rows
-                    self.statistics.bytes_raw += raw_bytes
                     self.blobs_seen += 1
+                    self.statistics.bytes_raw += raw_size
                     yield decoded

                 # if we have read all the rows we need to stop
@@ -247,14 +235,9 @@
                         decoder=decoder,
                         just_schema=True,
                     )
-                    # Some decoders may return None for schema (e.g. unreadable
-                    # or undecidable schema). Skip those and continue with the
-                    # next blob instead of trying to access attributes on None.
-                    if schema is None:
-                        continue
                     # if we have more than one blob we need to estimate the row count
                     blob_count = len(blob_names)
-                    if getattr(schema, "row_count_metric", None) and blob_count > 1:
+                    if schema.row_count_metric and blob_count > 1:
                         schema.row_count_estimate = schema.row_count_metric * blob_count
                         schema.row_count_metric = None
                     self.statistics.estimated_row_count += schema.row_count_estimate
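The estimate itself is simple arithmetic: when several blobs share one schema, the exact row count read from the first blob is promoted to a dataset-wide estimate, and the exact metric is cleared because it no longer describes the whole dataset. Note the hunk also drops the defensive schema-is-None and getattr guards, so the code now assumes every decoder returns a schema exposing row_count_metric. A worked example with made-up numbers:

# Hypothetical values, for illustration only.
row_count_metric = 10_000  # exact rows counted in the first blob
blob_count = 8             # blobs discovered for the dataset

if row_count_metric and blob_count > 1:
    row_count_estimate = row_count_metric * blob_count  # 80,000
    row_count_metric = None  # an exact count no longer applies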

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1707"
+version = "0.26.0-beta.1708"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
