Skip to content

Commit 675d15c

Browse files
committed
cython-performance
1 parent 7020ecc commit 675d15c

File tree

9 files changed

+634
-285
lines changed

9 files changed

+634
-285
lines changed

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1664
4+
__build__ = 1665
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1664"
6+
__version__ = "0.26.0-beta.1665"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/structures/bloom_filter.pxd

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@ cimport numpy
1111

1212
# Declaration of the BloomFilter class
1313
cdef class BloomFilter:
14-
cdef unsigned char* bit_array
15-
cdef uint32_t bit_array_size
16-
cdef uint32_t byte_array_size
14+
cdef uint64_t* bit_array
15+
cdef uint32_t bit64_array_size
16+
cdef uint32_t bit_array_size_bits
17+
cdef uint64_t bit_mask
1718

1819
cdef inline void _add(self, const uint64_t item)
1920
cdef inline bint _possibly_contains(self, const uint64_t item)

opteryx/compiled/structures/bloom_filter.pyx

Lines changed: 74 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ is the limit of what we think we should speculatively build.
3434
"""
3535

3636
from libc.stdlib cimport calloc, free
37-
from libc.stdint cimport uint8_t
37+
from libc.stdint cimport uint64_t, uint32_t
3838

3939
from opteryx.compiled.table_ops.hash_ops cimport compute_row_hashes
4040
from opteryx.compiled.table_ops.null_avoidant_ops cimport non_null_row_indices
@@ -45,43 +45,39 @@ cimport numpy
4545
cdef extern from "<stdint.h>":
4646
ctypedef unsigned long uintptr_t
4747

48-
# Define sizes for the Bloom filters
49-
cdef uint32_t BYTE_ARRAY_SIZE_TINY = 1 * 1024 # 1 KB for <= 1K records
50-
cdef uint32_t BYTE_ARRAY_SIZE_SMALL = 64 * 1024 # 64 KB for <= 60K records
51-
cdef uint32_t BYTE_ARRAY_SIZE_LARGE = 1024 * 1024 # 1 MB for <= 1M records
52-
cdef uint32_t BYTE_ARRAY_SIZE_HUGE = 16 * 1024 * 1024 # 16 MB for <= 16M records
53-
54-
cdef uint32_t BIT_ARRAY_SIZE_TINY = BYTE_ARRAY_SIZE_TINY << 3 # 8 Kbits
55-
cdef uint32_t BIT_ARRAY_SIZE_SMALL = BYTE_ARRAY_SIZE_SMALL << 3 # 512 Kbits
56-
cdef uint32_t BIT_ARRAY_SIZE_LARGE = BYTE_ARRAY_SIZE_LARGE << 3 # 8 Mbits
57-
cdef uint32_t BIT_ARRAY_SIZE_HUGE = BYTE_ARRAY_SIZE_HUGE << 3 # 128 Mbits
48+
# Define sizes for the Bloom filters - now in 64-bit chunks
49+
cdef uint32_t BIT64_ARRAY_SIZE_TINY = 128 # 128 * 64 = 8,192 bits
50+
cdef uint32_t BIT64_ARRAY_SIZE_SMALL = 8 * 1024 # 8K * 64 = 524,288 bits
51+
cdef uint32_t BIT64_ARRAY_SIZE_LARGE = 128 * 1024 # 128K * 64 = 8,388,608 bits
52+
cdef uint32_t BIT64_ARRAY_SIZE_HUGE = 2 * 1024 * 1024 # 2M * 64 = 134,217,728 bits
5853

54+
# Golden ratio constant for second hash
55+
cdef uint64_t GOLDEN_RATIO = 0x9E3779B97F4A7C15ULL
5956

6057
cdef class BloomFilter:
61-
# defined in the .pxd file only - here so they aren't magic
62-
# cdef unsigned char* bit_array
63-
# cdef uint32_t bit_array_size
64-
# cdef uint32_t byte_array_size
6558

6659
def __cinit__(self, uint32_t expected_records=50000):
6760
"""Initialize Bloom Filter based on expected number of records."""
6861
if expected_records <= 1_000:
69-
self.byte_array_size = BYTE_ARRAY_SIZE_TINY
70-
self.bit_array_size = BIT_ARRAY_SIZE_TINY
62+
self.bit64_array_size = BIT64_ARRAY_SIZE_TINY
63+
self.bit_array_size_bits = BIT64_ARRAY_SIZE_TINY * 64
7164
elif expected_records <= 62_000:
72-
self.byte_array_size = BYTE_ARRAY_SIZE_SMALL
73-
self.bit_array_size = BIT_ARRAY_SIZE_SMALL
65+
self.bit64_array_size = BIT64_ARRAY_SIZE_SMALL
66+
self.bit_array_size_bits = BIT64_ARRAY_SIZE_SMALL * 64
7467
elif expected_records <= 1_000_000:
75-
self.byte_array_size = BYTE_ARRAY_SIZE_LARGE
76-
self.bit_array_size = BIT_ARRAY_SIZE_LARGE
68+
self.bit64_array_size = BIT64_ARRAY_SIZE_LARGE
69+
self.bit_array_size_bits = BIT64_ARRAY_SIZE_LARGE * 64
7770
elif expected_records <= 16_000_000:
78-
self.byte_array_size = BYTE_ARRAY_SIZE_HUGE
79-
self.bit_array_size = BIT_ARRAY_SIZE_HUGE
71+
self.bit64_array_size = BIT64_ARRAY_SIZE_HUGE
72+
self.bit_array_size_bits = BIT64_ARRAY_SIZE_HUGE * 64
8073
else:
8174
raise ValueError("Too many records for this Bloom filter implementation")
8275

83-
# Allocate memory
84-
self.bit_array = <unsigned char*>calloc(self.byte_array_size, sizeof(uint8_t))
76+
# Precompute mask for faster modulo operations
77+
self.bit_mask = self.bit_array_size_bits - 1
78+
79+
# Allocate 64-bit aligned memory
80+
self.bit_array = <uint64_t*>calloc(self.bit64_array_size, sizeof(uint64_t))
8581
if not self.bit_array:
8682
raise MemoryError("Failed to allocate memory for the Bloom filter.")
8783

@@ -90,61 +86,80 @@ cdef class BloomFilter:
9086
free(self.bit_array)
9187

9288
cdef inline void _add(self, const uint64_t item):
93-
cdef uint32_t h1, h2
89+
cdef uint64_t h1, h2
90+
91+
# Use bit mask for fast modulo (works because sizes are powers of 2)
92+
h1 = item & self.bit_mask
93+
# Better hash mixing for second position
94+
h2 = (item * GOLDEN_RATIO) & self.bit_mask
9495

95-
h1 = item & (self.bit_array_size - 1)
96-
# Apply the golden ratio to the item and use a mask to keep within the
97-
# size of the bit array.
98-
h2 = (item * 2654435769U) & (self.bit_array_size - 1)
99-
self.bit_array[h1 >> 3] |= 1 << (h1 & 7)
100-
self.bit_array[h2 >> 3] |= 1 << (h2 & 7)
96+
# Set bits using 64-bit operations
97+
self.bit_array[h1 >> 6] |= (<uint64_t>1) << (h1 & 0x3F)
98+
self.bit_array[h2 >> 6] |= (<uint64_t>1) << (h2 & 0x3F)
10199

102100
cpdef void add(self, const uint64_t item):
103101
self._add(item)
104102

105103
cdef inline bint _possibly_contains(self, const uint64_t item):
106-
"""Check if the item might be in the set"""
107-
cdef uint32_t h1, h2
104+
cdef uint64_t h1, h2, mask1, mask2
108105

109-
h1 = item & (self.bit_array_size - 1)
110-
h2 = (item * 2654435769U) & (self.bit_array_size - 1)
111-
return (((self.bit_array[h1 >> 3] >> (h1 & 7)) & 1) != 0) and \
112-
(((self.bit_array[h2 >> 3] >> (h2 & 7)) & 1) != 0)
106+
h1 = item & self.bit_mask
107+
h2 = (item * GOLDEN_RATIO) & self.bit_mask
108+
109+
# Check both bits, using a single 64-bit load for each
110+
mask1 = (<uint64_t>1) << (h1 & 0x3F)
111+
mask2 = (<uint64_t>1) << (h2 & 0x3F)
112+
113+
return (self.bit_array[h1 >> 6] & mask1) != 0 and \
114+
(self.bit_array[h2 >> 6] & mask2) != 0
113115

114116
cpdef bint possibly_contains(self, const uint64_t item):
115117
return self._possibly_contains(item)
116118

117119
cpdef numpy.ndarray[numpy.npy_bool, ndim=1] possibly_contains_many(self, object relation, list columns):
118120
"""
119-
Return a boolean array indicating whether each row in `relation` might be in the Bloom filter.
120-
Null-containing rows are considered not present (False).
121+
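Batch check: return a boolean array indicating whether each row in `relation` might be in the Bloom filter; rows containing nulls in any of `columns` are reported as not present (False).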
121122
"""
122123
cdef Py_ssize_t num_rows = relation.num_rows
123-
cdef numpy.ndarray[numpy.npy_bool, ndim=1] result = numpy.zeros(num_rows, dtype=numpy.bool)
124+
cdef numpy.ndarray[numpy.npy_bool, ndim=1] result = numpy.zeros(num_rows, dtype=numpy.bool_)
124125
cdef uint8_t[::1] result_view = result
125126
cdef int64_t[::1] valid_row_ids = non_null_row_indices(relation, columns)
126127
cdef Py_ssize_t num_valid_rows = valid_row_ids.shape[0]
127128
cdef numpy.ndarray[numpy.uint64_t, ndim=1] row_hashes_np = numpy.zeros(num_rows, dtype=numpy.uint64)
128129
cdef uint64_t[::1] row_hashes = row_hashes_np
129130
cdef Py_ssize_t i
130131
cdef int64_t row_id
132+
cdef uint64_t hash_val, h1, h2, mask1, mask2
131133

132134
if num_valid_rows == 0:
133135
return result
134136

135137
# Compute hashes only for non-null rows
136138
compute_row_hashes(relation, columns, row_hashes)
137139

140+
# Precompute constants
141+
cdef uint64_t bit_mask = self.bit_mask
142+
cdef uint64_t golden_ratio = GOLDEN_RATIO
143+
cdef uint64_t* bit_array = self.bit_array
144+
138145
for i in range(num_valid_rows):
139146
row_id = valid_row_ids[i]
140-
result_view[row_id] = self._possibly_contains(row_hashes[row_id])
147+
hash_val = row_hashes[row_id]
148+
149+
h1 = hash_val & bit_mask
150+
h2 = (hash_val * golden_ratio) & bit_mask
151+
152+
mask1 = (<uint64_t>1) << (h1 & 0x3F)
153+
mask2 = (<uint64_t>1) << (h2 & 0x3F)
154+
155+
result_view[row_id] = (bit_array[h1 >> 6] & mask1) != 0 and \
156+
(bit_array[h2 >> 6] & mask2) != 0
141157

142158
return result
143159

144160
cpdef BloomFilter create_bloom_filter(object relation, list columns):
145161
"""
146-
Create a BloomFilter from the specified `columns` in `relation`,
147-
ignoring rows with nulls in any of the columns.
162+
Create a BloomFilter from the specified `columns` in `relation`, ignoring rows with nulls in any of the columns.
148163
"""
149164
cdef:
150165
Py_ssize_t num_rows = relation.num_rows
@@ -153,16 +168,30 @@ cpdef BloomFilter create_bloom_filter(object relation, list columns):
153168
numpy.ndarray[numpy.uint64_t, ndim=1] row_hashes_np = numpy.empty(num_rows, dtype=numpy.uint64)
154169
uint64_t[::1] row_hashes = row_hashes_np
155170
Py_ssize_t i
171+
int64_t row_id
156172
BloomFilter bf = BloomFilter(num_valid_rows)
173+
uint64_t hash_val, h1, h2
157174

158175
if num_valid_rows == 0:
159176
return bf
160177

161178
# Populate row hashes using the selected columns
162179
compute_row_hashes(relation, columns, row_hashes)
163180

181+
# Precompute constants for faster access
182+
cdef uint64_t bit_mask = bf.bit_mask
183+
cdef uint64_t golden_ratio = GOLDEN_RATIO
184+
cdef uint64_t* bit_array = bf.bit_array
185+
164186
# Add to bloom filter
165187
for i in range(num_valid_rows):
166-
bf._add(row_hashes[valid_row_ids[i]])
188+
row_id = valid_row_ids[i]
189+
hash_val = row_hashes[row_id]
190+
191+
h1 = hash_val & bit_mask
192+
h2 = (hash_val * golden_ratio) & bit_mask
193+
194+
bit_array[h1 >> 6] |= (<uint64_t>1) << (h1 & 0x3F)
195+
bit_array[h2 >> 6] |= (<uint64_t>1) << (h2 & 0x3F)
167196

168197
return bf

0 commit comments

Comments
 (0)