@@ -253,44 +253,53 @@ fn mergeIntersection(
253253 return out_idx ;
254254}
255255
256- // TODO: The pow2 requirement is quite inefficient: explore ideas posted in
257- // https://databasearchitects.blogspot.com/2019/07/cuckoo-filters-with-arbitrarily-sized.html
258- // (rocksdb even-odd scheme from comments looks interesting).
/// Classifies an integer by its lowest bit: `.even` when the bit is clear,
/// `.odd` when it is set.
fn parity(integer: anytype) enum(u1) { even, odd } {
    const lowest_bit = integer & 1;
    return if (lowest_bit == 0) .even else .odd;
}
259+
259260pub const CuckooFilter = struct {
260- /// len must be a power of 2.
261- ///
262- /// ### Pathological case with buckets.len power of 2
263- ///
264- /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_2`
265- /// - `BucketIndex(alias_1)` -> `bucket_1`, `BucketIndex(alias_1).alternate()` -> `bucket_2`
266- ///
267- /// Our alternate mappings hold and `contains()` will not return false negatives.
268- ///
269- /// ### Pathological case with buckets.len NOT power of 2:
270- ///
271- /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_3`
272- /// - `BucketIndex(alias_1)` -> `bucket_2`, `BucketIndex(alias_1).alternate()` -> `bucket_4`
273- ///
274- /// Our alternate mappings do not hold and `contains()` can return false negatives. This is not
275- /// acceptable as the entire point of an AMQ datastructure is the presence of false positives
276- /// but not false negatives.
277261 buckets : []Bucket ,
278262
/// One-byte fingerprint stored in a bucket slot.
pub const Fingerprint = enum(u8) {
    /// Reserved value (255). It is never a valid argument to `oddHash`.
    none = std.math.maxInt(u8),
    _,

    /// Odd 32-bit hash for every fingerprint value 0..254, indexed by the
    /// fingerprint's integer value. Each entry is the Murmur2_64 hash of the
    /// single byte with its lowest bit forced to 1, truncated to 32 bits.
    const precomputed_odd_hashes = blk: {
        var hashes: [255]u32 = undefined;

        for (0..hashes.len) |byte| {
            const full = std.hash.Murmur2_64.hash(&.{byte});
            // OR-ing with 1 before truncation guarantees an odd result.
            hashes[byte] = @truncate(full | 1);
        }

        break :blk hashes;
    };

    /// Returns the precomputed odd hash of `fingerprint`.
    /// Asserts that `fingerprint` is not `.none` (the table has no entry for it).
    pub fn oddHash(fingerprint: Fingerprint) u32 {
        assert(fingerprint != .none);
        const slot: usize = @intFromEnum(fingerprint);
        return precomputed_odd_hashes[slot];
    }
};
282+
/// A bucket holds four fingerprint slots.
pub const Bucket = [4]Fingerprint;

/// Position of a bucket inside the filter's bucket slice.
pub const BucketIndex = enum(u32) {
    _,

    /// Returns the other candidate bucket for `fingerprint`, given a filter of
    /// `len` buckets.
    ///
    /// Even indices step forward by the fingerprint's odd hash, odd indices
    /// step backward by it; the result is wrapped into `0..len`. Adding or
    /// subtracting an odd number flips parity, so the returned index always
    /// has the opposite parity of `index`.
    ///
    /// Asserts `index < len` and `fingerprint != .none`. Assumes `len` is
    /// even; the closing parity assertion relies on it.
    pub fn alternate(index: BucketIndex, fingerprint: Fingerprint, len: u32) BucketIndex {
        assert(@intFromEnum(index) < len);
        assert(fingerprint != .none);

        // Work in i64 so that `here - stride` may go negative before wrapping.
        const here: i64 = @intFromEnum(index);
        const stride: i64 = fingerprint.oddHash();
        const here_parity = parity(here);

        const shifted = if (here_parity == .even) here + stride else here - stride;
        const wrapped: u32 = @intCast(@mod(shifted, len));

        assert(here_parity != parity(wrapped));

        return @enumFromInt(wrapped);
    }
};
296305
@@ -299,41 +308,46 @@ pub const CuckooFilter = struct {
299308 index_1 : BucketIndex ,
300309 index_2 : BucketIndex ,
301310
/// Derives the fingerprint and both candidate bucket indices for `trigram`
/// in a filter of `len` buckets.
///
/// The 64-bit Murmur2_64 hash of the trigram is reinterpreted as a
/// fingerprint byte, 24 unused bits and a 32-bit raw index. The raw index is
/// reduced modulo `len`; a fingerprint equal to the reserved `.none` is
/// remapped to 1. Asserts that taking the alternate of `index_2` leads back
/// to `index_1`.
pub fn initFromTrigram(trigram: Trigram, len: u32) Triplet {
    const Parts = packed struct {
        fingerprint: Fingerprint,
        padding: u24,
        index_1: u32,
    };
    const parts: Parts = @bitCast(std.hash.Murmur2_64.hash(&trigram));

    const fingerprint: Fingerprint = switch (parts.fingerprint) {
        .none => @enumFromInt(1),
        else => parts.fingerprint,
    };

    const first: BucketIndex = @enumFromInt(parts.index_1 % len);
    const second = first.alternate(fingerprint, len);

    // The two candidate buckets must map onto each other.
    assert(second.alternate(fingerprint, len) == first);

    return .{
        .fingerprint = fingerprint,
        .index_1 = first,
        .index_2 = second,
    };
}
323334 };
324335
/// Wraps caller-provided bucket storage in a filter.
/// Asserts that the number of buckets is even. The storage is not cleared
/// here; `reset` does that.
pub fn init(buckets: []Bucket) CuckooFilter {
    const filter: CuckooFilter = .{ .buckets = buckets };
    assert(parity(filter.buckets.len) == .even);
    return filter;
}
340+
/// Marks every slot of every bucket as `.none`.
pub fn reset(filter: CuckooFilter) void {
    const empty_bucket: Bucket = [_]Fingerprint{.none} ** @typeInfo(Bucket).array.len;
    for (filter.buckets) |*bucket| {
        bucket.* = empty_bucket;
    }
}
328344
/// Returns the number of buckets to use for `count` elements: `count`
/// rounded up to the next even number.
///
/// Returns `error.Overflow` when the rounded value does not fit in `u32`,
/// which only happens for `count == maxInt(u32)`.
pub fn capacityForCount(count: u32) error{Overflow}!u32 {
    // Checked addition: a plain `+` would trap (or be undefined in
    // ReleaseFast) instead of reporting the declared error.
    return std.math.add(u32, count, count & 1);
}
333348
334- // Use a hash (fnv) for randomness.
335349 pub fn append (filter : CuckooFilter , random : std.Random , trigram : Trigram ) error {EvictionFailed }! void {
336- const triplet : Triplet = .initFromTrigram (trigram );
350+ const triplet : Triplet = .initFromTrigram (trigram , @intCast ( filter . buckets . len ) );
337351
338352 if (filter .appendToBucket (triplet .index_1 , triplet .fingerprint ) or
339353 filter .appendToBucket (triplet .index_2 , triplet .fingerprint ))
@@ -345,7 +359,7 @@ pub const CuckooFilter = struct {
345359 var index = if (random .boolean ()) triplet .index_1 else triplet .index_2 ;
346360 for (0.. 500) | _ | {
347361 fingerprint = filter .swapFromBucket (random , index , fingerprint );
348- index = index .alternate (fingerprint );
362+ index = index .alternate (fingerprint , @intCast ( filter . buckets . len ) );
349363
350364 if (filter .appendToBucket (index , fingerprint )) {
351365 return ;
@@ -356,8 +370,7 @@ pub const CuckooFilter = struct {
356370 }
357371
/// Returns a pointer to the bucket stored at `index`.
/// The index is used as-is, with no masking or wrapping.
fn bucketAt(filter: CuckooFilter, index: BucketIndex) *Bucket {
    const slot: usize = @intFromEnum(index);
    return &filter.buckets[slot];
}
362375
363376 fn appendToBucket (filter : CuckooFilter , index : BucketIndex , fingerprint : Fingerprint ) bool {
@@ -382,6 +395,7 @@ pub const CuckooFilter = struct {
382395 ) Fingerprint {
383396 assert (fingerprint != .none );
384397
398+ comptime assert (@typeInfo (Bucket ).array .len == 4 );
385399 const target = & filter .bucketAt (index )[random .int (u2 )];
386400
387401 const old_fingerprint = target .* ;
@@ -393,7 +407,7 @@ pub const CuckooFilter = struct {
393407 }
394408
395409 pub fn contains (filter : CuckooFilter , trigram : Trigram ) bool {
396- const triplet : Triplet = .initFromTrigram (trigram );
410+ const triplet : Triplet = .initFromTrigram (trigram , @intCast ( filter . buckets . len ) );
397411
398412 return filter .containsInBucket (triplet .index_1 , triplet .fingerprint ) or
399413 filter .containsInBucket (triplet .index_2 , triplet .fingerprint );
@@ -417,16 +431,15 @@ pub const CuckooFilter = struct {
417431test CuckooFilter {
418432 const allocator = std .testing .allocator ;
419433
420- const element_count = 486 ;
434+ const element_count = 499 ;
421435 const filter_size = comptime CuckooFilter .capacityForCount (element_count ) catch unreachable ;
422- try std .testing .expectEqual (512 , filter_size );
423436
424437 var entries : std .AutoArrayHashMapUnmanaged (Trigram , void ) = .empty ;
425438 defer entries .deinit (allocator );
426439 try entries .ensureTotalCapacity (allocator , element_count );
427440
428441 var buckets : [filter_size ]CuckooFilter.Bucket = undefined ;
429- var filter : CuckooFilter = .{ . buckets = & buckets } ;
442+ var filter : CuckooFilter = .init ( & buckets ) ;
430443 var filter_prng : std.Random.DefaultPrng = .init (42 );
431444
432445 for (0.. 2_500) | gen_prng_seed | {
0 commit comments