@@ -34,7 +34,7 @@ is the limit of what we think we should speculatively build.
3434""" 
3535
3636from  libc.stdlib cimport calloc, free
37- from  libc.stdint cimport uint8_t 
37+ from  libc.stdint cimport uint64_t, uint32_t 
3838
3939from  opteryx.compiled.table_ops.hash_ops cimport compute_row_hashes
4040from  opteryx.compiled.table_ops.null_avoidant_ops cimport non_null_row_indices
@@ -45,43 +45,39 @@ cimport numpy
4545cdef extern from  " <stdint.h>" 
4646    ctypedef unsigned  long  uintptr_t
4747
# Bloom filter capacities, expressed as a count of 64-bit words.
# Each capacity is a power of two, so the total bit count is too and a bit
# position can be taken with a mask (bits - 1) rather than a modulo.
cdef uint32_t BIT64_ARRAY_SIZE_TINY = 1 << 7     # 128 words  * 64 =       8,192 bits
cdef uint32_t BIT64_ARRAY_SIZE_SMALL = 1 << 13   # 8K words   * 64 =     524,288 bits
cdef uint32_t BIT64_ARRAY_SIZE_LARGE = 1 << 17   # 128K words * 64 =   8,388,608 bits
cdef uint32_t BIT64_ARRAY_SIZE_HUGE = 1 << 21    # 2M words   * 64 = 134,217,728 bits

# 64-bit golden ratio multiplier used to derive the second bit position
cdef uint64_t GOLDEN_RATIO = 0x9E3779B97F4A7C15ULL
5956
6057cdef class  BloomFilter:
61-     #  defined in the .pxd file only - here so they aren't magic
62-     #  cdef unsigned char* bit_array
63-     #  cdef uint32_t bit_array_size
64-     #  cdef uint32_t byte_array_size
6558
6659    def  __cinit__ (self , uint32_t expected_records = 50000 ):
6760        """ Initialize Bloom Filter based on expected number of records.""" 
6861        if  expected_records <=  1 _000:
69-             self .byte_array_size  =  BYTE_ARRAY_SIZE_TINY 
70-             self .bit_array_size  =  BIT_ARRAY_SIZE_TINY 
62+             self .bit64_array_size  =  BIT64_ARRAY_SIZE_TINY 
63+             self .bit_array_size_bits  =  BIT64_ARRAY_SIZE_TINY  *   64 
7164        elif  expected_records <=  62 _000:
72-             self .byte_array_size  =  BYTE_ARRAY_SIZE_SMALL 
73-             self .bit_array_size  =  BIT_ARRAY_SIZE_SMALL 
65+             self .bit64_array_size  =  BIT64_ARRAY_SIZE_SMALL 
66+             self .bit_array_size_bits  =  BIT64_ARRAY_SIZE_SMALL  *   64 
7467        elif  expected_records <=  1 _000_000:
75-             self .byte_array_size  =  BYTE_ARRAY_SIZE_LARGE 
76-             self .bit_array_size  =  BIT_ARRAY_SIZE_LARGE 
68+             self .bit64_array_size  =  BIT64_ARRAY_SIZE_LARGE 
69+             self .bit_array_size_bits  =  BIT64_ARRAY_SIZE_LARGE  *   64 
7770        elif  expected_records <=  16 _000_000:
78-             self .byte_array_size  =  BYTE_ARRAY_SIZE_HUGE 
79-             self .bit_array_size  =  BIT_ARRAY_SIZE_HUGE 
71+             self .bit64_array_size  =  BIT64_ARRAY_SIZE_HUGE 
72+             self .bit_array_size_bits  =  BIT64_ARRAY_SIZE_HUGE  *   64 
8073        else :
8174            raise  ValueError (" Too many records for this Bloom filter implementation" 
8275
83-         #  Allocate memory
84-         self .bit_array =  < unsigned  char * > calloc(self .byte_array_size, sizeof(uint8_t))
76+         #  Precompute mask for faster modulo operations
77+         self .bit_mask =  self .bit_array_size_bits -  1 
78+ 
79+         #  Allocate 64-bit aligned memory
80+         self .bit_array =  < uint64_t* > calloc(self .bit64_array_size, sizeof(uint64_t))
8581        if  not  self .bit_array:
8682            raise  MemoryError (" Failed to allocate memory for the Bloom filter." 
8783
@@ -90,61 +86,80 @@ cdef class BloomFilter:
9086            free(self .bit_array)
9187
9288    cdef inline void  _add(self , const uint64_t item):
93-         cdef uint32_t h1, h2
89+         cdef uint64_t h1, h2
90+ 
91+         #  Use bit mask for fast modulo (works because sizes are powers of 2)
92+         h1 =  item &  self .bit_mask
93+         #  Better hash mixing for second position
94+         h2 =  (item *  GOLDEN_RATIO) &  self .bit_mask
9495
95-         h1 =  item &  (self .bit_array_size -  1 )
96-         #  Apply the golden ratio to the item and use a mask to keep within the
97-         #  size of the bit array.
98-         h2 =  (item *  2654435769 U) &  (self .bit_array_size -  1 )
99-         self .bit_array[h1 >>  3 ] |=  1  <<  (h1 &  7 )
100-         self .bit_array[h2 >>  3 ] |=  1  <<  (h2 &  7 )
96+         #  Set bits using 64-bit operations
97+         self .bit_array[h1 >>  6 ] |=  (< uint64_t> 1 ) <<  (h1 &  0x3F )
98+         self .bit_array[h2 >>  6 ] |=  (< uint64_t> 1 ) <<  (h2 &  0x3F )
10199
102100    cpdef void  add(self , const uint64_t item):
103101        self ._add(item)
104102
105103    cdef inline bint _possibly_contains(self , const uint64_t item):
106-         """ Check if the item might be in the set""" 
107-         cdef uint32_t h1, h2
104+         cdef uint64_t h1, h2, mask1, mask2
108105
109-         h1 =  item &  (self .bit_array_size -  1 )
110-         h2 =  (item *  2654435769 U) &  (self .bit_array_size -  1 )
111-         return  (((self .bit_array[h1 >>  3 ] >>  (h1 &  7 )) &  1 ) !=  0 ) and  \
112-                (((self .bit_array[h2 >>  3 ] >>  (h2 &  7 )) &  1 ) !=  0 )
106+         h1 =  item &  self .bit_mask
107+         h2 =  (item *  GOLDEN_RATIO) &  self .bit_mask
108+ 
109+         #  Check both bits with single 64-bit load each
110+         mask1 =  (< uint64_t> 1 ) <<  (h1 &  0x3F )
111+         mask2 =  (< uint64_t> 1 ) <<  (h2 &  0x3F )
112+ 
113+         return  (self .bit_array[h1 >>  6 ] &  mask1) !=  0  and  \
114+                (self .bit_array[h2 >>  6 ] &  mask2) !=  0 
113115
114116    cpdef bint possibly_contains(self , const uint64_t item):
115117        return  self ._possibly_contains(item)
116118
117119    cpdef numpy.ndarray[numpy.npy_bool, ndim= 1 ] possibly_contains_many(self , object  relation, list  columns):
118120        """ 
119-         Return a boolean array indicating whether each row in `relation` might be in the Bloom filter. 
120-         Null-containing rows are considered not present (False). 
121+         Optimized batch checking with better memory access patterns. 
121122        """  
122123        cdef Py_ssize_t num_rows =  relation.num_rows
123-         cdef numpy.ndarray[numpy.npy_bool, ndim= 1 ] result =  numpy.zeros(num_rows, dtype = numpy.bool )
124+         cdef numpy.ndarray[numpy.npy_bool, ndim= 1 ] result =  numpy.zeros(num_rows, dtype = numpy.bool_ )
124125        cdef uint8_t[::1 ] result_view =  result
125126        cdef int64_t[::1 ] valid_row_ids =  non_null_row_indices(relation, columns)
126127        cdef Py_ssize_t num_valid_rows =  valid_row_ids.shape[0 ]
127128        cdef numpy.ndarray[numpy.uint64_t, ndim= 1 ] row_hashes_np =  numpy.zeros(num_rows, dtype = numpy.uint64)
128129        cdef uint64_t[::1 ] row_hashes =  row_hashes_np
129130        cdef Py_ssize_t i
130131        cdef int64_t row_id
132+         cdef uint64_t hash_val, h1, h2, mask1, mask2
131133
132134        if  num_valid_rows ==  0 :
133135            return  result
134136
135137        #  Compute hashes only for non-null rows
136138        compute_row_hashes(relation, columns, row_hashes)
137139
140+         #  Precompute constants
141+         cdef uint64_t bit_mask =  self .bit_mask
142+         cdef uint64_t golden_ratio =  GOLDEN_RATIO
143+         cdef uint64_t*  bit_array =  self .bit_array
144+ 
138145        for  i in  range (num_valid_rows):
139146            row_id =  valid_row_ids[i]
140-             result_view[row_id] =  self ._possibly_contains(row_hashes[row_id])
147+             hash_val =  row_hashes[row_id]
148+ 
149+             h1 =  hash_val &  bit_mask
150+             h2 =  (hash_val *  golden_ratio) &  bit_mask
151+ 
152+             mask1 =  (< uint64_t> 1 ) <<  (h1 &  0x3F )
153+             mask2 =  (< uint64_t> 1 ) <<  (h2 &  0x3F )
154+ 
155+             result_view[row_id] =  (bit_array[h1 >>  6 ] &  mask1) !=  0  and  \
156+                 (bit_array[h2 >>  6 ] &  mask2) !=  0 
141157
142158        return  result
143159
144160cpdef BloomFilter create_bloom_filter(object  relation, list  columns):
145161    """ 
146-     Create a BloomFilter from the specified `columns` in `relation`, 
147-     ignoring rows with nulls in any of the columns. 
162+     Optimized Bloom filter creation with better cache behavior. 
148163    """  
149164    cdef:
150165        Py_ssize_t num_rows =  relation.num_rows
@@ -153,16 +168,30 @@ cpdef BloomFilter create_bloom_filter(object relation, list columns):
153168        numpy.ndarray[numpy.uint64_t, ndim= 1 ] row_hashes_np =  numpy.empty(num_rows, dtype = numpy.uint64)
154169        uint64_t[::1 ] row_hashes =  row_hashes_np
155170        Py_ssize_t i
171+         int64_t row_id
156172        BloomFilter bf =  BloomFilter(num_valid_rows)
173+         uint64_t hash_val, h1, h2
157174
158175    if  num_valid_rows ==  0 :
159176        return  bf
160177
161178    #  Populate row hashes using the selected columns
162179    compute_row_hashes(relation, columns, row_hashes)
163180
181+     #  Precompute constants for faster access
182+     cdef uint64_t bit_mask =  bf.bit_mask
183+     cdef uint64_t golden_ratio =  GOLDEN_RATIO
184+     cdef uint64_t*  bit_array =  bf.bit_array
185+ 
164186    #  Add to bloom filter
165187    for  i in  range (num_valid_rows):
166-         bf._add(row_hashes[valid_row_ids[i]])
188+         row_id =  valid_row_ids[i]
189+         hash_val =  row_hashes[row_id]
190+ 
191+         h1 =  hash_val &  bit_mask
192+         h2 =  (hash_val *  golden_ratio) &  bit_mask
193+ 
194+         bit_array[h1 >>  6 ] |=  (< uint64_t> 1 ) <<  (h1 &  0x3F )
195+         bit_array[h2 >>  6 ] |=  (< uint64_t> 1 ) <<  (h2 &  0x3F )
167196
168197    return  bf
0 commit comments