Skip to content

Commit fe38b3b

Browse files
authored
Merge pull request #2873 from mabel-dev/clickbench-performance-regression-investigation-1
fix disk io errors
2 parents 4b1b389 + 664cc26 commit fe38b3b

File tree

11 files changed

+1597
-3
lines changed

11 files changed

+1597
-3
lines changed

examples/disk_reader_usage.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Quick reference examples for the disk_reader module.
4+
"""
5+
6+
from opteryx.compiled.io.disk_reader import read_file
7+
from opteryx.compiled.io.disk_reader import read_file_to_bytes
8+
9+
10+
# Example 1: Basic file reading
11+
def example_basic():
    """Load a single file and report its size plus a short preview."""
    contents = read_file("temp.json")  # read_file hands back a memoryview
    print(f"File size: {len(contents)} bytes")

    # Materialise a bytes object for callers that need one
    as_bytes = bytes(contents)
    print(f"First 50 chars: {as_bytes[:50]}")
19+
20+
21+
# Example 2: Stream large files without cache pollution
22+
def example_streaming():
    """Walk through several large files, evicting each from cache once read."""
    for filename in ("planets-gw0.duckdb", "planets-gw1.duckdb"):
        # drop_after=True asks the reader to evict the file from cache
        data = read_file(filename, drop_after=True)
        print(f"{filename}: {len(data):,} bytes")
30+
31+
32+
# Example 3: Zero-copy operations with memoryview
33+
def example_zero_copy():
    """Slice the first line out of a file without copying the buffer.

    ``read_file`` returns a memoryview, and slicing a memoryview yields
    another view over the same memory. The newline is located with
    ``re.search``, which scans bytes-like objects in place, so the file
    contents are never duplicated just to find the line boundary.
    """
    import re  # local import: only this example needs it

    data = read_file("temp.csv")

    # memoryview has no .find(); search the buffer directly rather than
    # calling tobytes(), which would copy the entire file first.
    newline = re.search(b'\n', data)

    # Without a newline the whole buffer is the first line. Slicing with a
    # bare find() result of -1 would silently drop the final byte instead.
    end = newline.start() if newline else len(data)
    first_line = data[:end]

    print(f"First line: {bytes(first_line)}")
41+
42+
43+
# Example 4: Using read_file_to_bytes for convenience
44+
def example_bytes():
    """Fetch the file as a bytes object rather than a memoryview."""
    raw = read_file_to_bytes("temp.md")

    # A bytes object exposes the full bytes API, e.g. split()
    line_count = len(raw.split(b'\n'))
    print(f"Number of lines: {line_count}")
51+
52+
53+
# Example 5: I/O hints for optimal performance
54+
def example_io_hints():
    """Show the keyword hints read_file accepts for different access patterns."""
    # Large file read front to back: sequential access plus prefetch
    buffer = read_file("large_file.bin", sequential=True, willneed=True)

    # Randomly accessed file: switch the sequential hint off
    buffer = read_file("index_file.bin", sequential=False)

    # Huge file processed only once: drop it from cache after the read
    buffer = read_file("temporary_data.bin", drop_after=True)
65+
66+
67+
if __name__ == "__main__":
    import sys

    print("disk_reader Quick Examples")
    print("=" * 60)

    try:
        # Heading/function pairs, run in order; example_io_hints is not run.
        demos = (
            ("\n1. Basic Reading:", example_basic),
            ("\n2. Streaming Large Files:", example_streaming),
            ("\n3. Zero-Copy Operations:", example_zero_copy),
            ("\n4. Bytes Convenience Method:", example_bytes),
        )
        for heading, demo in demos:
            print(heading)
            demo()

        print("\n" + "=" * 60)
        print("✓ All examples completed successfully!")

    except Exception as e:
        # Any failure stops the run and exits with a non-zero status
        print(f"\nError: {e}")
        sys.exit(1)

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1704
4+
__build__ = 1705
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1704"
6+
__version__ = "0.26.0-beta.1705"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/io/disk_reader.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#ifndef DISK_READER_H
2+
#define DISK_READER_H
3+
4+
#include <cstddef>
5+
#include <cstdint>
6+
7+
/**
8+
* Fast disk reader with platform-specific I/O optimizations
9+
*
10+
* @param path File path to read
11+
* @param dst Destination buffer (must be pre-allocated). NOTE(review): no capacity argument is passed, so the caller presumably sizes it to the file beforehand - confirm against the implementation
12+
* @param out_len Output parameter for bytes read
13+
* @param sequential Hint for sequential access pattern
14+
* @param willneed Hint that data will be needed soon (prefetch)
15+
* @param drop_after Drop page cache after reading
16+
* @return 0 on success, negative errno on failure
17+
*/
18+
int read_all_pread(const char* path, uint8_t* dst, size_t* out_len,
19+
bool sequential, bool willneed, bool drop_after);
20+
21+
/**
22+
* Memory-map a file for reading. An empty file is not mapped at all.
23+
*
24+
* @param path File path to map
25+
* @param dst Output parameter for mapped memory address (set to nullptr for an empty file)
26+
* @param out_len Output parameter for file size (set to 0 for an empty file)
27+
* @return 0 on success (including the empty-file case), negative errno on failure
28+
*/
29+
int read_all_mmap(const char* path, uint8_t** dst, size_t* out_len);
30+
31+
/**
32+
* Unmap memory that was mapped with read_all_mmap
33+
*
34+
* @param addr Address to unmap. NOTE(review): read_all_mmap yields nullptr with size 0 for empty files - confirm this call tolerates that pair
35+
* @param size Size of the mapped region
36+
* @return 0 on success, negative errno on failure
37+
*/
38+
int unmap_memory_c(uint8_t* addr, size_t size);
39+
40+
#endif // DISK_READER_H

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.0-beta.1704"
3+
version = "0.26.0-beta.1705"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}

src/cpp/disk_io.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,15 @@ int read_all_mmap(const char* path, uint8_t** dst, size_t* out_len) {
167167
}
168168

169169
size_t size = static_cast<size_t>(st.st_size);
170+
171+
// Handle empty files - mmap doesn't work with size 0
172+
if (size == 0) {
173+
close(fd);
174+
*dst = nullptr;
175+
*out_len = 0;
176+
return 0;
177+
}
178+
170179
void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
171180
close(fd);
172181

@@ -191,6 +200,15 @@ int read_all_mmap(const char* path, uint8_t** dst, size_t* out_len) {
191200
}
192201

193202
size_t size = static_cast<size_t>(st.st_size);
203+
204+
// Handle empty files - mmap doesn't work with size 0
205+
if (size == 0) {
206+
close(fd);
207+
*dst = nullptr;
208+
*out_len = 0;
209+
return 0;
210+
}
211+
194212
void* mapped = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
195213
close(fd);
196214

@@ -214,6 +232,14 @@ int read_all_mmap(const char* path, uint8_t** dst, size_t* out_len) {
214232
DWORD sizeLow = GetFileSize(hFile, &sizeHigh);
215233
size_t size = (static_cast<size_t>(sizeHigh) << 32) | sizeLow;
216234

235+
// Handle empty files
236+
if (size == 0) {
237+
CloseHandle(hFile);
238+
*dst = nullptr;
239+
*out_len = 0;
240+
return 0;
241+
}
242+
217243
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
218244
if (!hMapping) {
219245
CloseHandle(hFile);

tests/compiled/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Compiled module tests

tests/compiled/io/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# I/O module tests

0 commit comments

Comments
 (0)