Skip to content

Commit 2f66c8f

Browse files
Cache high-order bits of hashcode to speed up BytesRefHash (#14720)
1 parent 50b4363 commit 2f66c8f

File tree

2 files changed

+102
-33
lines changed

2 files changed

+102
-33
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ Optimizations
138138

139139
* GITHUB#14674: Optimize AbstractKnnVectorQuery#createBitSet with intoBitset. (Guo Feng)
140140

141+
* GITHUB#14720: Cache high-order bits of hashcode to speed up BytesRefHash. (Pan Guixin)
142+
141143
Bug Fixes
142144
---------------------
143145
* GITHUB#14654: ValueSource.fromDoubleValuesSource(dvs).getSortField() would throw errors when

lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java

Lines changed: 100 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,58 @@ public final class BytesRefHash implements Accountable {
5050
private int hashSize;
5151
private int hashHalfSize;
5252
private int hashMask;
53+
// This mask is used to extract the high bits from a hashcode
54+
private int highMask;
5355
private int count;
5456
private int lastCount = -1;
57+
58+
/**
59+
* The <code>ids</code> array serves a dual purpose:
60+
*
61+
* <ol>
62+
* <li>When the value is <code>-1</code>, it indicates an empty slot in the hash table.
63+
* <li>When the value is not <code>-1</code>, it stores:
64+
* <ul>
65+
* <li>The actual index into the <code>bytesStart</code> array (low bits, masked by <code>
66+
* hashMask</code>).
67+
* <li>The high bits of the original hashcode (high bits, masked by <code>highMask</code>
68+
* ).
69+
* </ul>
70+
* </ol>
71+
*
72+
* <p>This "trick" allows us to store both the index and part of the hashcode in a single int,
73+
* which speeds up collision handling by quickly rejecting non-matching entries without having to
74+
* compare the actual byte values. During lookups, we can immediately check if the high bits match
75+
* before doing the more expensive byte comparison.
76+
*
77+
* <p><b>Example:</b>
78+
*
79+
* <ul>
80+
* <li>hashSize = 16, therefore <code>hashMask = 15</code> (<code>0x0000000F</code>)
81+
* <li><code>highMask = ~hashMask = 0xFFFFFFF0</code>
82+
* </ul>
83+
*
84+
* <p>When storing the value 7 with hashcode <code>0x12345678</code>:
85+
*
86+
* <ul>
87+
* <li>The low bits (index) are 7 (<code>0x00000007</code>)
88+
* <li>The high bits of hashcode are <code>0x12345670</code>
89+
* <li>The stored value becomes: <code>0x12345677</code>
90+
* </ul>
91+
*
92+
* <p><b>During lookup:</b>
93+
*
94+
* <ol>
95+
* <li>We compute the hashcode and find the slot.
96+
* <li>We extract the stored value's high bits (<code>&amp; highMask</code>).
97+
* <li>If they match the lookup hashcode's high bits, we proceed to compare the actual bytes.
98+
* <li>Otherwise, we immediately know it's not a match and continue probing.
99+
* </ol>
100+
*
101+
* <p>This significantly improves performance for hash lookups, especially with many collisions.
102+
*/
55103
private int[] ids;
104+
56105
private final BytesStartArray bytesStartArray;
57106
private final Counter bytesUsed;
58107

@@ -71,9 +120,17 @@ public BytesRefHash(ByteBlockPool pool) {
71120

72121
/** Creates a new {@link BytesRefHash} */
73122
public BytesRefHash(ByteBlockPool pool, int capacity, BytesStartArray bytesStartArray) {
123+
if (capacity <= 0) {
124+
throw new IllegalArgumentException("capacity must be greater than 0");
125+
}
126+
127+
if (BitUtil.isZeroOrPowerOfTwo(capacity) == false) {
128+
throw new IllegalArgumentException("capacity must be a power of two, got " + capacity);
129+
}
74130
hashSize = capacity;
75131
hashHalfSize = hashSize >> 1;
76132
hashMask = hashSize - 1;
133+
highMask = ~hashMask;
77134
this.pool = new BytesRefBlockPool(pool);
78135
ids = new int[hashSize];
79136
Arrays.fill(ids, -1);
@@ -124,8 +181,8 @@ public int[] compact() {
124181
int upto = 0;
125182
for (int i = 0; i < hashSize; i++) {
126183
if (ids[i] != -1) {
184+
ids[upto] = ids[i] & hashMask;
127185
if (upto < i) {
128-
ids[upto] = ids[i];
129186
ids[i] = -1;
130187
}
131188
upto++;
@@ -232,6 +289,7 @@ private boolean shrink(int targetSize) {
232289
Arrays.fill(ids, -1);
233290
hashHalfSize = newSize / 2;
234291
hashMask = newSize - 1;
292+
highMask = ~hashMask;
235293
return true;
236294
} else {
237295
return false;
@@ -276,8 +334,9 @@ public void close() {
276334
*/
277335
public int add(BytesRef bytes) {
278336
assert bytesStart != null : "Bytesstart is null - not initialized";
337+
final int hashcode = doHash(bytes.bytes, bytes.offset, bytes.length);
279338
// final position
280-
final int hashPos = findHash(bytes);
339+
final int hashPos = findHash(bytes, hashcode);
281340
int e = ids[hashPos];
282341

283342
if (e == -1) {
@@ -289,13 +348,14 @@ public int add(BytesRef bytes) {
289348
bytesStart[count] = pool.addBytesRef(bytes);
290349
e = count++;
291350
assert ids[hashPos] == -1;
292-
ids[hashPos] = e;
351+
ids[hashPos] = e | (hashcode & highMask);
293352

294353
if (count == hashHalfSize) {
295354
rehash(2 * hashSize, true);
296355
}
297356
return e;
298357
}
358+
e = e & hashMask;
299359
return -(e + 1);
300360
}
301361

@@ -306,25 +366,28 @@ public int add(BytesRef bytes) {
306366
* @return the id of the given bytes, or {@code -1} if there is no mapping for the given bytes.
307367
*/
308368
public int find(BytesRef bytes) {
309-
return ids[findHash(bytes)];
369+
final int hashcode = doHash(bytes.bytes, bytes.offset, bytes.length);
370+
final int id = ids[findHash(bytes, hashcode)];
371+
return id == -1 ? -1 : id & hashMask;
310372
}
311373

312-
private int findHash(BytesRef bytes) {
374+
private int findHash(BytesRef bytes, int hashcode) {
313375
assert bytesStart != null : "bytesStart is null - not initialized";
376+
assert hashcode == doHash(bytes.bytes, bytes.offset, bytes.length);
314377

315-
int code = doHash(bytes.bytes, bytes.offset, bytes.length);
316-
378+
int code = hashcode;
317379
// final position
318380
int hashPos = code & hashMask;
319381
int e = ids[hashPos];
320-
if (e != -1 && pool.equals(bytesStart[e], bytes) == false) {
321-
// Conflict; use linear probe to find an open slot
322-
// (see LUCENE-5604):
323-
do {
324-
code++;
325-
hashPos = code & hashMask;
326-
e = ids[hashPos];
327-
} while (e != -1 && pool.equals(bytesStart[e], bytes) == false);
382+
final int highBits = hashcode & highMask;
383+
384+
// Conflict; use linear probe to find an open slot
385+
// (see LUCENE-5604):
386+
while (e != -1
387+
&& ((e & highMask) != highBits || pool.equals(bytesStart[e & hashMask], bytes) == false)) {
388+
code++;
389+
hashPos = code & hashMask;
390+
e = ids[hashPos];
328391
}
329392

330393
return hashPos;
@@ -342,14 +405,13 @@ public int addByPoolOffset(int offset) {
342405
int code = offset;
343406
int hashPos = offset & hashMask;
344407
int e = ids[hashPos];
345-
if (e != -1 && bytesStart[e] != offset) {
346-
// Conflict; use linear probe to find an open slot
347-
// (see LUCENE-5604):
348-
do {
349-
code++;
350-
hashPos = code & hashMask;
351-
e = ids[hashPos];
352-
} while (e != -1 && bytesStart[e] != offset);
408+
409+
// Conflict; use linear probe to find an open slot
410+
// (see LUCENE-5604):
411+
while (e != -1 && bytesStart[e] != offset) {
412+
code++;
413+
hashPos = code & hashMask;
414+
e = ids[hashPos];
353415
}
354416
if (e == -1) {
355417
// new entry
@@ -375,34 +437,39 @@ public int addByPoolOffset(int offset) {
375437
*/
376438
private void rehash(final int newSize, boolean hashOnData) {
377439
final int newMask = newSize - 1;
440+
final int newHighMask = ~newMask;
378441
bytesUsed.addAndGet(Integer.BYTES * (long) newSize);
379442
final int[] newHash = new int[newSize];
380443
Arrays.fill(newHash, -1);
381444
for (int i = 0; i < hashSize; i++) {
382-
final int e0 = ids[i];
445+
int e0 = ids[i];
383446
if (e0 != -1) {
447+
e0 &= hashMask;
448+
final int hashcode;
384449
int code;
385450
if (hashOnData) {
386-
code = pool.hash(bytesStart[e0]);
451+
hashcode = code = pool.hash(bytesStart[e0]);
387452
} else {
388453
code = bytesStart[e0];
454+
hashcode = 0;
389455
}
390456

391457
int hashPos = code & newMask;
392458
assert hashPos >= 0;
393-
if (newHash[hashPos] != -1) {
394-
// Conflict; use linear probe to find an open slot
395-
// (see LUCENE-5604):
396-
do {
397-
code++;
398-
hashPos = code & newMask;
399-
} while (newHash[hashPos] != -1);
459+
460+
// Conflict; use linear probe to find an open slot
461+
// (see LUCENE-5604):
462+
while (newHash[hashPos] != -1) {
463+
code++;
464+
hashPos = code & newMask;
400465
}
401-
newHash[hashPos] = e0;
466+
467+
newHash[hashPos] = e0 | (hashcode & newHighMask);
402468
}
403469
}
404470

405471
hashMask = newMask;
472+
highMask = newHighMask;
406473
bytesUsed.addAndGet(Integer.BYTES * (long) -ids.length);
407474
ids = newHash;
408475
hashSize = newSize;

0 commit comments

Comments
 (0)