Skip to content

Commit 2ea26aa

Browse files
authored
Merge pull request #2879 from mabel-dev/clickbench-performance-regression-investigation-1
linux disk performance
2 parents 945862e + e7d31f3 commit 2ea26aa

File tree

4 files changed

+10
-28
lines changed

4 files changed

+10
-28
lines changed

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1707
4+
__build__ = 1710
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1707"
6+
__version__ = "0.26.0-beta.1710"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/disk_connector.py

Lines changed: 6 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,7 @@
88
given as a folder on local disk
99
"""
1010

11-
import contextlib
12-
import mmap
1311
import os
14-
import platform
1512
import time
1613
from typing import Dict
1714
from typing import List
@@ -34,17 +31,6 @@
3431
from opteryx.utils.file_decoders import get_decoder
3532

3633
OS_SEP = os.sep
37-
IS_LINUX = platform.system() == "Linux"
38-
39-
40-
# prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in
41-
flags = mmap.MAP_PRIVATE
42-
if IS_LINUX:
43-
with contextlib.suppress(Exception):
44-
flags |= getattr(mmap, "MAP_POPULATE", 0)
45-
mmap_config = {}
46-
mmap_config["flags"] = flags
47-
mmap_config["prot"] = mmap.PROT_READ
4834

4935

5036
class DiskConnector(BaseConnector, Partitionable, PredicatePushable, LimitPushable, Statistics):
@@ -128,7 +114,7 @@ def read_blob(
128114
If an I/O error occurs while reading the file.
129115
"""
130116
from opteryx.compiled.io.disk_reader import read_file_mmap
131-
from opteryx.compiled.io.disk_reader import unmap_memory
117+
#from opteryx.compiled.io.disk_reader import unmap_memory
132118

133119
# Read using mmap for maximum speed
134120
mmap_obj = read_file_mmap(blob_name)
@@ -157,7 +143,8 @@ def read_blob(
157143
return result
158144
finally:
159145
# CRITICAL: Clean up the memory mapping
160-
unmap_memory(mmap_obj)
146+
pass
147+
# unmap_memory(mmap_obj)
161148

162149
@single_item_cache
163150
def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
@@ -219,7 +206,7 @@ def read_dataset(
219206
decoder = get_decoder(blob_name)
220207
try:
221208
if not just_schema:
222-
num_rows, _, raw_bytes, decoded = self.read_blob(
209+
num_rows, _, raw_size, decoded = self.read_blob(
223210
blob_name=blob_name,
224211
decoder=decoder,
225212
just_schema=False,
@@ -234,8 +221,8 @@ def read_dataset(
234221

235222
self.statistics.rows_seen += num_rows
236223
self.rows_seen += num_rows
237-
self.statistics.bytes_raw += raw_bytes
238224
self.blobs_seen += 1
225+
self.statistics.bytes_raw += raw_size
239226
yield decoded
240227

241228
# if we have read all the rows we need to stop
@@ -247,14 +234,9 @@ def read_dataset(
247234
decoder=decoder,
248235
just_schema=True,
249236
)
250-
# Some decoders may return None for schema (e.g. unreadable
251-
# or undecidable schema). Skip those and continue with the
252-
# next blob instead of trying to access attributes on None.
253-
if schema is None:
254-
continue
255237
# if we have more than one blob we need to estimate the row count
256238
blob_count = len(blob_names)
257-
if getattr(schema, "row_count_metric", None) and blob_count > 1:
239+
if schema.row_count_metric and blob_count > 1:
258240
schema.row_count_estimate = schema.row_count_metric * blob_count
259241
schema.row_count_metric = None
260242
self.statistics.estimated_row_count += schema.row_count_estimate

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.0-beta.1707"
3+
version = "0.26.0-beta.1710"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}

src/cpp/disk_io.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,7 @@ int read_all_mmap(const char* path, uint8_t** dst, size_t* out_len) {
176176
return 0;
177177
}
178178

179-
void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
179+
void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
180180
close(fd);
181181

182182
if (mapped == MAP_FAILED) {

0 commit comments

Comments
 (0)