Skip to content

Commit 6442f5a

Browse files
authored
Merge pull request #2856 from mabel-dev/performance-tweaks
performance-tweaks
2 parents 121615c + 0b0829d commit 6442f5a

26 files changed

+1991
-360
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
List hashing fast-path — opteryx compiled/table_ops/hash_ops
2+
3+
Purpose
4+
- Document when the list hashing implementation takes a buffer-aware fast path (no Python object allocation) and when it falls back to per-element Python hashing.
5+
6+
Where it lives
7+
- Implementation: `opteryx/compiled/table_ops/hash_ops.pyx` — function `process_list_chunk`.
8+
- Tests: `tests/unit/diagnostic/test_list_fast_paths.py`.
9+
10+
Fast-path conditions
11+
- The list handler will use buffer-aware, zero-Python-object inner loops when the list child type is one of:
12+
- integer types (signed/unsigned, fixed-width)
13+
- floating point types
14+
- temporal types (timestamps/dates)
15+
- string or binary child types (string buffers + offsets)
16+
17+
- For the above child types the code reads child buffers directly and computes element hashes without creating Python objects. This gives a large performance win for dense numeric/string lists.
18+
19+
Fallback cases
20+
- If the list child type is a complex/unrecognized Arrow type (for example, structs, maps, or arbitrary Python objects), the implementation falls back to slicing the child array and calling Python-level hashing for each element. This is correct but slower.
21+
22+
Correctness notes
23+
- All paths account for Arrow `chunk.offset` on both the parent list array and on the child array. Validity bitmaps are checked with proper bit/byte arithmetic.
24+
- 8-byte primitive loads are done via `memcpy` into a local `uint64_t` to avoid unaligned memory reads.
25+
26+
Testing and benchmarks
27+
- Unit tests in `tests/unit/diagnostic/test_list_fast_paths.py` validate parity between flat and chunked arrays and basic correctness for nested and boolean lists.
28+
- Benchmarks live in `tests/performance/benchmarks/bench_hash_ops.py`.
29+
30+
When to extend
31+
- If nested lists of primitives appear frequently in your workloads, consider implementing a dedicated stack-based fast path for nested lists to avoid repeated slice() allocations.
32+
- If child types are frequently small fixed-width types, additional micro-optimizations (incremental bit/byte pointers rather than recomputing shifts) can pay off.
33+
34+
"Why not always buffer-aware?"
35+
- Some Arrow child types are not stored as simple contiguous buffers accessible by offset arithmetic (e.g., structs or other nested variable-width complex types). In those cases, the safe and correct approach is to create Python objects and hash them.
36+
37+
Contact
38+
- If you have a representative large dataset that still performs poorly, attach it (or a small reproducer) to an issue so it can be benchmarked and iterated on.

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1666
4+
__build__ = 1676
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1666"
6+
__version__ = "0.26.0-beta.1676"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/structures/buffers.pxd

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ cdef extern from "intbuffer.h" namespace "":
1717
CIntBuffer(size_t size_hint)
1818
void append(int64_t value)
1919
void extend(const vector[int64_t]& values)
20+
void extend(const int64_t* values, size_t count)
2021
const int64_t* data() const
2122
size_t size() const
2223

@@ -29,3 +30,6 @@ cdef class IntBuffer:
2930
cpdef void extend(self, iterable)
3031
cpdef numpy.ndarray[int64_t, ndim=1] to_numpy(self)
3132
cpdef size_t size(self)
33+
cpdef void extend_numpy(self, numpy.ndarray[int64_t, ndim=1] arr)
34+
cpdef void reserve(self, size_t capacity)
35+
cpdef void append_batch(self, int64_t[::1] values)

opteryx/compiled/structures/buffers.pyx

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,52 +13,81 @@ numpy.import_array()
1313

1414
from libc.stdint cimport int64_t
1515
from libcpp.vector cimport vector
16+
from libc.string cimport memcpy
1617

1718
cdef extern from "intbuffer.h":
    # C++ growable buffer of int64 values (declared in intbuffer.h).
    # All methods are nogil so they can be driven from tight Cython loops.
    cdef cppclass CIntBuffer:
        CIntBuffer(size_t size_hint)
        void append(int64_t item) nogil
        void extend(const vector[int64_t]& items) nogil
        void extend(const int64_t* items, size_t n) nogil
        const int64_t* data() nogil
        size_t size() nogil
2526

2627
cdef class IntBuffer:
    """Python wrapper around the C++ CIntBuffer growable int64 buffer."""

    def __cinit__(self, size_t size_hint = 1024):
        # size_hint pre-sizes the underlying C++ buffer to avoid early reallocations
        self.c_buffer = new CIntBuffer(size_hint)

    def __dealloc__(self):
        del self.c_buffer

    cpdef void append(self, int64_t value):
        """Append a single integer to the buffer."""
        self.c_buffer.append(value)

    cpdef void append_batch(self, int64_t[::1] values):
        """Append a contiguous int64 memoryview in one bulk C call."""
        cdef size_t n = values.shape[0]
        if n > 0:
            self.c_buffer.extend(&values[0], n)

    cpdef void extend(self, object iterable):
        """Extend the buffer with an iterable of integers.

        numpy arrays take a bulk-copy fast path; sized containers are
        staged through a C++ vector when large; unsized iterables
        (e.g. generators) fall back to per-item appends.
        """
        # Fast path for numpy arrays: normalize to contiguous int64 once,
        # then bulk-copy.
        if isinstance(iterable, numpy.ndarray):
            arr = numpy.ascontiguousarray(iterable, dtype=numpy.int64)
            self.extend_numpy(arr)
            return

        # len() raises TypeError for unsized iterables (generators etc.);
        # in that case append one element at a time rather than failing.
        cdef size_t estimated_size
        try:
            estimated_size = len(iterable)
        except TypeError:
            for item in iterable:
                self.c_buffer.append(item)
            return

        cdef vector[int64_t] vec
        if estimated_size > 1000:  # large: stage in a vector, one bulk extend
            vec.reserve(estimated_size)
            for item in iterable:
                vec.push_back(item)
            self.c_buffer.extend(vec)
        else:
            # Small iterables: per-item append avoids the vector setup cost.
            for item in iterable:
                self.c_buffer.append(item)

    cpdef void extend_numpy(self, numpy.ndarray[int64_t, ndim=1] arr):
        """Extend with a 1-D int64 numpy array - fastest method."""
        cdef size_t n = arr.shape[0]
        if n > 0:
            self.c_buffer.extend(&arr[0], n)

    cpdef numpy.ndarray[int64_t, ndim=1] to_numpy(self):
        """Copy the buffer contents into a new NumPy array via memcpy."""
        cdef size_t size = self.c_buffer.size()

        if size == 0:
            return numpy.empty(0, dtype=numpy.int64)

        cdef const int64_t* data_ptr = self.c_buffer.data()
        cdef numpy.ndarray[int64_t, ndim=1] arr = numpy.empty(size, dtype=numpy.int64)

        # Single memcpy instead of a Python-level element copy loop.
        memcpy(<void*>&arr[0], <const void*>data_ptr, size * sizeof(int64_t))
        return arr

    cpdef size_t size(self):
        """Number of values currently stored in the buffer."""
        return self.c_buffer.size()

    cpdef void reserve(self, size_t capacity):
        """Reserve capacity for future appends.

        NOTE(review): currently a no-op - CIntBuffer does not expose a
        reserve() method; add one to intbuffer.h before relying on this
        for performance.
        """
        pass

0 commit comments

Comments
 (0)