@@ -50,9 +50,58 @@ public final class BytesRefHash implements Accountable {
5050 private int hashSize ;
5151 private int hashHalfSize ;
5252 private int hashMask ;
53+ // This mask extracts the high bits from a hashcode or from a value stored in ids
54+ private int highMask ;
5355 private int count ;
5456 private int lastCount = -1 ;
57+
58+ /**
59+ * The <code>ids</code> array serves a dual purpose:
60+ *
61+ * <ol>
62+ * <li>When the value is <code>-1</code>, it indicates an empty slot in the hash table.
63+ * <li>When the value is not <code>-1</code>, it stores:
64+ * <ul>
65+ * <li>The actual index into the <code>bytesStart</code> array (low bits, masked by <code>
66+ * hashMask</code>).
67+ * <li>The high bits of the original hashcode (high bits, masked by <code>highMask</code>
68+ * ).
69+ * </ul>
70+ * </ol>
71+ *
72+ * <p>This "trick" allows us to store both the index and part of the hashcode in a single int,
73+ * which speeds up resolving hash collisions by quickly rejecting non-matching entries without
74+ * having to compare the actual byte values. During lookups, we can immediately check if the high
75+ * bits match before doing the more expensive byte comparison.
76+ *
77+ * <p><b>Example:</b>
78+ *
79+ * <ul>
80+ * <li>hashSize = 16, therefore <code>hashMask = 15</code> (<code>0x0000000F</code>)
81+ * <li><code>highMask = ~hashMask = 0xFFFFFFF0</code>
82+ * </ul>
83+ *
84+ * <p>When storing index 7 for an entry whose hashcode is <code>0x12345678</code>:
85+ *
86+ * <ul>
87+ * <li>The low bits (index) are 7 (<code>0x00000007</code>)
88+ * <li>The high bits of hashcode are <code>0x12345670</code>
89+ * <li>The stored value becomes: <code>0x12345677</code>
90+ * </ul>
91+ *
92+ * <p><b>During lookup:</b>
93+ *
94+ * <ol>
95+ * <li>We compute the hashcode and find the slot.
96+ * <li>We extract the stored value's high bits (<code>&amp; highMask</code>).
97+ * <li>If they match the lookup hashcode's high bits, we proceed to comparing actual bytes.
98+ * <li>Otherwise, we immediately know it's not a match and continue probing.
99+ * </ol>
100+ *
101+ * <p>This significantly improves performance for hash lookups, especially with many collisions.
102+ */
55103 private int [] ids ;
104+
56105 private final BytesStartArray bytesStartArray ;
57106 private final Counter bytesUsed ;
58107
@@ -71,9 +120,17 @@ public BytesRefHash(ByteBlockPool pool) {
71120
72121 /** Creates a new {@link BytesRefHash} */
73122 public BytesRefHash (ByteBlockPool pool , int capacity , BytesStartArray bytesStartArray ) {
123+ if (capacity <= 0 ) {
124+ throw new IllegalArgumentException ("capacity must be greater than 0" );
125+ }
126+
127+ if (BitUtil .isZeroOrPowerOfTwo (capacity ) == false ) {
128+ throw new IllegalArgumentException ("capacity must be a power of two, got " + capacity );
129+ }
74130 hashSize = capacity ;
75131 hashHalfSize = hashSize >> 1 ;
76132 hashMask = hashSize - 1 ;
133+ highMask = ~hashMask ;
77134 this .pool = new BytesRefBlockPool (pool );
78135 ids = new int [hashSize ];
79136 Arrays .fill (ids , -1 );
@@ -124,8 +181,8 @@ public int[] compact() {
124181 int upto = 0 ;
125182 for (int i = 0 ; i < hashSize ; i ++) {
126183 if (ids [i ] != -1 ) {
184+ ids [upto ] = ids [i ] & hashMask ;
127185 if (upto < i ) {
128- ids [upto ] = ids [i ];
129186 ids [i ] = -1 ;
130187 }
131188 upto ++;
@@ -232,6 +289,7 @@ private boolean shrink(int targetSize) {
232289 Arrays .fill (ids , -1 );
233290 hashHalfSize = newSize / 2 ;
234291 hashMask = newSize - 1 ;
292+ highMask = ~hashMask ;
235293 return true ;
236294 } else {
237295 return false ;
@@ -276,8 +334,9 @@ public void close() {
276334 */
277335 public int add (BytesRef bytes ) {
278336 assert bytesStart != null : "Bytesstart is null - not initialized" ;
337+ final int hashcode = doHash (bytes .bytes , bytes .offset , bytes .length );
279338 // final position
280- final int hashPos = findHash (bytes );
339+ final int hashPos = findHash (bytes , hashcode );
281340 int e = ids [hashPos ];
282341
283342 if (e == -1 ) {
@@ -289,13 +348,14 @@ public int add(BytesRef bytes) {
289348 bytesStart [count ] = pool .addBytesRef (bytes );
290349 e = count ++;
291350 assert ids [hashPos ] == -1 ;
292- ids [hashPos ] = e ;
351+ ids [hashPos ] = e | ( hashcode & highMask ) ;
293352
294353 if (count == hashHalfSize ) {
295354 rehash (2 * hashSize , true );
296355 }
297356 return e ;
298357 }
358+ e = e & hashMask ;
299359 return -(e + 1 );
300360 }
301361
@@ -306,25 +366,28 @@ public int add(BytesRef bytes) {
306366 * @return the id of the given bytes, or {@code -1} if there is no mapping for the given bytes.
307367 */
308368 public int find (BytesRef bytes ) {
309- return ids [findHash (bytes )];
369+ final int hashcode = doHash (bytes .bytes , bytes .offset , bytes .length ); // hash once; findHash takes it as an argument
370+ final int id = ids [findHash (bytes , hashcode )];
371+ return id == -1 ? -1 : id & hashMask ; // -1 (no mapping) passes through unchanged; otherwise keep only the bits under hashMask
310372 }
311373
312- private int findHash (BytesRef bytes ) {
374+ private int findHash (BytesRef bytes , int hashcode ) {
313375 assert bytesStart != null : "bytesStart is null - not initialized" ;
376+ assert hashcode == doHash (bytes .bytes , bytes .offset , bytes .length );
314377
315- int code = doHash (bytes .bytes , bytes .offset , bytes .length );
316-
378+ int code = hashcode ;
317379 // final position
318380 int hashPos = code & hashMask ;
319381 int e = ids [hashPos ];
320- if (e != -1 && pool .equals (bytesStart [e ], bytes ) == false ) {
321- // Conflict; use linear probe to find an open slot
322- // (see LUCENE-5604):
323- do {
324- code ++;
325- hashPos = code & hashMask ;
326- e = ids [hashPos ];
327- } while (e != -1 && pool .equals (bytesStart [e ], bytes ) == false );
382+ final int highBits = hashcode & highMask ;
383+
384+ // Conflict; use linear probe to find an open slot
385+ // (see LUCENE-5604):
386+ while (e != -1
387+ && ((e & highMask ) != highBits || pool .equals (bytesStart [e & hashMask ], bytes ) == false )) {
388+ code ++;
389+ hashPos = code & hashMask ;
390+ e = ids [hashPos ];
328391 }
329392
330393 return hashPos ;
@@ -342,14 +405,13 @@ public int addByPoolOffset(int offset) {
342405 int code = offset ;
343406 int hashPos = offset & hashMask ;
344407 int e = ids [hashPos ];
345- if (e != -1 && bytesStart [e ] != offset ) {
346- // Conflict; use linear probe to find an open slot
347- // (see LUCENE-5604):
348- do {
349- code ++;
350- hashPos = code & hashMask ;
351- e = ids [hashPos ];
352- } while (e != -1 && bytesStart [e ] != offset );
408+
409+ // Conflict; use linear probe to find an open slot
410+ // (see LUCENE-5604):
411+ while (e != -1 && bytesStart [e ] != offset ) {
412+ code ++;
413+ hashPos = code & hashMask ;
414+ e = ids [hashPos ];
353415 }
354416 if (e == -1 ) {
355417 // new entry
@@ -375,34 +437,39 @@ public int addByPoolOffset(int offset) {
375437 */
376438 private void rehash (final int newSize , boolean hashOnData ) {
377439 final int newMask = newSize - 1 ;
440+ final int newHighMask = ~newMask ;
378441 bytesUsed .addAndGet (Integer .BYTES * (long ) newSize );
379442 final int [] newHash = new int [newSize ];
380443 Arrays .fill (newHash , -1 );
381444 for (int i = 0 ; i < hashSize ; i ++) {
382- final int e0 = ids [i ];
445+ int e0 = ids [i ];
383446 if (e0 != -1 ) {
447+ e0 &= hashMask ;
448+ final int hashcode ;
384449 int code ;
385450 if (hashOnData ) {
386- code = pool .hash (bytesStart [e0 ]);
451+ hashcode = code = pool .hash (bytesStart [e0 ]);
387452 } else {
388453 code = bytesStart [e0 ];
454+ hashcode = 0 ;
389455 }
390456
391457 int hashPos = code & newMask ;
392458 assert hashPos >= 0 ;
393- if (newHash [hashPos ] != -1 ) {
394- // Conflict; use linear probe to find an open slot
395- // (see LUCENE-5604):
396- do {
397- code ++;
398- hashPos = code & newMask ;
399- } while (newHash [hashPos ] != -1 );
459+
460+ // Conflict; use linear probe to find an open slot
461+ // (see LUCENE-5604):
462+ while (newHash [hashPos ] != -1 ) {
463+ code ++;
464+ hashPos = code & newMask ;
400465 }
401- newHash [hashPos ] = e0 ;
466+
467+ newHash [hashPos ] = e0 | (hashcode & newHighMask );
402468 }
403469 }
404470
405471 hashMask = newMask ;
472+ highMask = newHighMask ;
406473 bytesUsed .addAndGet (Integer .BYTES * (long ) -ids .length );
407474 ids = newHash ;
408475 hashSize = newSize ;
0 commit comments