@@ -23,6 +23,67 @@ inline uint32_t reduce(uint32_t hash, uint32_t n) {
2323 return (uint32_t )(((uint64_t )hash * n) >> 32 );
2424}
2525
/**
 * Maps a 32-bit value "word" to an integer in [0,p) without division.
 *
 * The result is the upper 32 bits of the 64-bit product word * p. The
 * mapping is as fair as possible in the sense that iterating through all
 * possible values of "word" generates all possible outputs as uniformly
 * as possible.
 *
 * @param word value to reduce
 * @param p    exclusive upper bound of the result (p == 0 yields 0)
 * @return an integer in [0,p)
 */
static inline uint32_t fastrange32(uint32_t word, uint32_t p) {
  // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
  // Widen first so the multiplication cannot overflow, then keep the high half.
  const uint64_t product = static_cast<uint64_t>(word) * static_cast<uint64_t>(p);
  return static_cast<uint32_t>(product >> 32);
}
36+
#if defined(_MSC_VER) && defined(_WIN64)
#include <intrin.h>  // should be part of all recent Visual Studio
#pragma intrinsic(_umul128)
#endif  // defined(_MSC_VER) && defined(_WIN64)

/**
 * Maps a 64-bit value "word" to an integer in [0,p).
 *
 * When a 128-bit integer type or the MSVC _umul128 intrinsic is available,
 * the result is the upper 64 bits of the 128-bit product word * p, computed
 * without division. That mapping is as fair as possible in the sense that
 * iterating through all possible values of "word" generates all possible
 * outputs as uniformly as possible.
 *
 * On any other platform the function falls back to word % p, which does use
 * a division, returns different values than the multiply-based paths, and
 * requires p != 0.
 *
 * @param word value to reduce
 * @param p    exclusive upper bound of the result
 * @return an integer in [0,p)
 */
static inline uint64_t fastrange64(uint64_t word, uint64_t p) {
  // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
#ifdef __SIZEOF_INT128__  // then we know we have a 128-bit int
  const __uint128_t product =
      static_cast<__uint128_t>(word) * static_cast<__uint128_t>(p);
  return static_cast<uint64_t>(product >> 64);
#elif defined(_MSC_VER) && defined(_WIN64)
  // supported in Visual Studio 2005 and better
  uint64_t highProduct;
  _umul128(word, p, &highProduct);  // the returned low half is not needed
  return highProduct;
#else
  return word % p;  // fallback: uses a division, p must be non-zero
#endif  // __SIZEOF_INT128__
}
67+
68+
69+ #ifndef UINT32_MAX
70+ #define UINT32_MAX (0xffffffff )
71+ #endif // UINT32_MAX
72+
73+ /* *
74+ * Given a value "word", produces an integer in [0,p) without division.
75+ * The function is as fair as possible in the sense that if you iterate
76+ * through all possible values of "word", then you will generate all
77+ * possible outputs as uniformly as possible.
78+ */
79+ static inline size_t fastrangesize (uint64_t word, size_t p) {
80+ #if (SIZE_MAX == UINT32_MAX)
81+ return (size_t )fastrange32 (word, p);
82+ #else // assume 64-bit
83+ return (size_t )fastrange64 (word, p);
84+ #endif // SIZE_MAX == UINT32_MAX
85+ }
86+
/**
 * Computes a hash-function count from the number of bits per item.
 *
 * Evaluates bitsPerItem * ln(2), rounds it to the nearest integer and
 * clamps the result so that it is never less than 1.
 *
 * @param bitsPerItem number of filter bits per stored item
 * @return round(bitsPerItem * ln 2), but at least 1
 */
static size_t getBestK(size_t bitsPerItem) {
  // Qualified std:: calls so the function does not rely on a using-directive.
  const double ideal = static_cast<double>(bitsPerItem) * std::log(2);
  return std::max(1, static_cast<int>(std::round(ideal)));
}
@@ -85,14 +146,14 @@ template <typename ItemType, size_t bits_per_item, bool branchless,
85146Status BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::Add(
86147 const ItemType &key) {
87148 uint64_t hash = hasher (key);
88- uint32_t a = (uint32_t )( hash >> 32 );
89- uint32_t b = ( uint32_t ) hash;
149+ uint64_t a = (hash >> 32 ) | (hash << 32 );
150+ uint64_t b = hash;
90151 for (int i = 0 ; i < k; i++) {
91152 // int index = reduce(a, this->bitCount);
92153 // data[index >> 6] |= getBit(index);
93154 // reworked to avoid overflows
94155 // use the fact that reduce is not very sensitive to lower bits of a
95- data[reduce (a, this ->arrayLength )] |= getBit (a);
156+ data[fastrangesize (a, this ->arrayLength )] |= getBit (a);
96157 a += b;
97158 }
98159 return Ok;
@@ -118,10 +179,10 @@ Status BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::AddAll(
118179 for (size_t i = start; i < end; i++) {
119180 uint64_t key = keys[i];
120181 uint64_t hash = hasher (key);
121- uint32_t a = (uint32_t )( hash >> 32 );
122- uint32_t b = ( uint32_t ) hash;
182+ uint64_t a = (hash >> 32 ) | (hash << 32 ); ;
183+ uint64_t b = hash;
123184 for (int j = 0 ; j < k; j++) {
124- int index = reduce (a, this ->arrayLength );
185+ int index = fastrangesize (a, this ->arrayLength );
125186 int block = index >> blockShift;
126187 int len = tmpLen[block];
127188 tmp[(block << blockShift) + len] = (index << 6 ) + (a & 63 );
@@ -149,27 +210,27 @@ template <typename ItemType, size_t bits_per_item, bool branchless,
149210Status BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::Contain(
150211 const ItemType &key) const {
151212 uint64_t hash = hasher (key);
152- uint32_t a = (uint32_t )( hash >> 32 );
153- uint32_t b = ( uint32_t ) hash;
213+ uint64_t a = (hash >> 32 ) | (hash << 32 ); ;
214+ uint64_t b = hash;
154215 if (branchless && k >= 3 ) {
155- int b0 = data[reduce (a, this ->arrayLength )] >> (a & 63 );
216+ int b0 = data[fastrangesize (a, this ->arrayLength )] >> (a & 63 );
156217 a += b;
157- int b1 = data[reduce (a, this ->arrayLength )] >> (a & 63 );
218+ int b1 = data[fastrangesize (a, this ->arrayLength )] >> (a & 63 );
158219 a += b;
159- int b2 = data[reduce (a, this ->arrayLength )] >> (a & 63 );
220+ int b2 = data[fastrangesize (a, this ->arrayLength )] >> (a & 63 );
160221 if ((b0 & b1 & b2 & 1 ) == 0 ) {
161222 return NotFound;
162223 }
163224 for (int i = 3 ; i < k; i++) {
164225 a += b;
165- if (((data[reduce (a, this ->arrayLength )] >> (a & 63 )) & 1 ) == 0 ) {
226+ if (((data[fastrangesize (a, this ->arrayLength )] >> (a & 63 )) & 1 ) == 0 ) {
166227 return NotFound;
167228 }
168229 }
169230 return Ok;
170231 }
171232 for (int i = 0 ; i < k; i++) {
172- if ((data[reduce (a, this ->arrayLength )] & getBit (a)) == 0 ) {
233+ if ((data[fastrangesize (a, this ->arrayLength )] & getBit (a)) == 0 ) {
173234 return NotFound;
174235 }
175236 a += b;
0 commit comments