12 changes: 6 additions & 6 deletions lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java
@@ -98,8 +98,8 @@ public class FSTCompiler<T> {
   // it will throw exceptions if attempt to call getReverseBytesReader() or writeTo(DataOutput)
   private static final FSTReader NULL_FST_READER = new NullFSTReader();

-  private final NodeHash<T> dedupHash;
-  // a temporary FST used during building for NodeHash cache
+  private final FSTSuffixNodeCache<T> suffixDedupCache;
+  // a temporary FST used during building for FSTSuffixNodeCache cache
   final FST<T> fst;
   private final T NO_OUTPUT;

@@ -178,9 +178,9 @@ private FSTCompiler(
     if (suffixRAMLimitMB < 0) {
       throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
     } else if (suffixRAMLimitMB > 0) {
-      dedupHash = new NodeHash<>(this, suffixRAMLimitMB);
+      suffixDedupCache = new FSTSuffixNodeCache<>(this, suffixRAMLimitMB);
     } else {
-      dedupHash = null;
+      suffixDedupCache = null;
     }
     NO_OUTPUT = outputs.getNoOutput();

@@ -379,12 +379,12 @@ public long getArcCount() {
   private CompiledNode compileNode(UnCompiledNode<T> nodeIn) throws IOException {
     final long node;
     long bytesPosStart = numBytesWritten;
-    if (dedupHash != null) {
+    if (suffixDedupCache != null) {
       if (nodeIn.numArcs == 0) {
         node = addNode(nodeIn);
         lastFrozenNode = node;
       } else {
-        node = dedupHash.add(nodeIn);
+        node = suffixDedupCache.add(nodeIn);
       }
     } else {
       node = addNode(nodeIn);
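Taken together, these hunks show the cache is only engaged when suffixRAMLimitMB > 0; with a limit of 0 the suffixDedupCache branch is skipped and every node is frozen via addNode, trading FST size for build-time RAM. For context, here is a minimal sketch of driving FSTCompiler with suffix sharing enabled. It assumes the Lucene 9.8-era builder API (FSTCompiler.Builder.suffixRAMLimitMB matches the ramLimitMB checks above); the exact build()/compile() signatures vary across 9.x releases:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class SuffixDedupExample {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    // suffixRAMLimitMB > 0 enables the suffix dedup cache (the suffixDedupCache
    // branch above); 0 disables sharing and yields a larger, non-minimal FST
    FSTCompiler<Long> fstCompiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).suffixRAMLimitMB(16).build();

    IntsRefBuilder scratch = new IntsRefBuilder();
    // inputs must arrive in sorted order; "nation" and "station" share the
    // suffix "ation", whose frozen nodes the cache deduplicates
    fstCompiler.add(Util.toIntsRef(new BytesRef("nation"), scratch), 17L);
    fstCompiler.add(Util.toIntsRef(new BytesRef("station"), scratch), 42L);

    FST<Long> fst = fstCompiler.compile(); // freeze the FST (signature varies by release)
    System.out.println(Util.get(fst, Util.toIntsRef(new BytesRef("station"), scratch))); // 42
  }
}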
lucene/core/src/java/org/apache/lucene/util/fst/{NodeHash.java → FSTSuffixNodeCache.java}

@@ -31,8 +31,24 @@
 // TODO: couldn't we prune naturally back until we see a transition with an output? it's highly
 // unlikely (mostly impossible) such suffixes can be shared?

-// Used to dedup states (lookup already-frozen states)
-final class NodeHash<T> {
+/**
+ * This is essentially an LRU cache to maintain and look up node suffixes. An un-compiled node can
+ * be added into the cache, and if a similar node already exists we return its address in the FST.
+ * A node counts as similar if it has the same label, arcs, outputs & other properties that
+ * identify a node.
+ *
+ * <p>The total size of the cache is controlled through the constructor parameter <code>ramLimitMB
+ * </code>. Implementation-wise, we maintain two lookup tables: a primary table where nodes are
+ * looked up first, and a fallback lookup table in case the primary lookup fails. Nodes from the
+ * fallback table can also be promoted to the primary table when that happens. When the primary
+ * table is full, we swap it with the fallback table and clear out the primary table.
+ *
+ * <p>To look up a node address, we build a special hash table which maps from the node hash value
+ * to the node address in the FST, called <code>PagedGrowableHash</code>. Internally it uses {@link
+ * PagedGrowableWriter} to store the mapping, which allows efficient packing of the hash & address
+ * long values, and uses {@link ByteBlockPool} to store the actual node content (arcs & outputs).
+ */
+final class FSTSuffixNodeCache<T> {

   // primary table -- we add nodes into this until it reaches the requested tableSizeLimit/2, then
   // we move it to fallback
@@ -60,7 +76,7 @@ final class NodeHash<T> {
    * recently used suffixes are discarded, and the FST is no longer minimal. Still, larger
    * ramLimitMB will make the FST smaller (closer to minimal).
    */
-  public NodeHash(FSTCompiler<T> fstCompiler, double ramLimitMB) {
+  public FSTSuffixNodeCache(FSTCompiler<T> fstCompiler, double ramLimitMB) {
     if (ramLimitMB <= 0) {
       throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
     }
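The promote/swap behavior described in the new javadoc is easier to see in miniature. Below is a hypothetical sketch of the two-table flow, with plain HashMaps standing in for PagedGrowableHash and its ByteBlockPool-backed node storage; the names TwoTableCacheSketch and lookupOrAdd are illustrative only, not Lucene API:

import java.util.HashMap;

// Hypothetical illustration of the primary/fallback scheme; the real
// FSTSuffixNodeCache uses PagedGrowableWriter + ByteBlockPool, not HashMap.
class TwoTableCacheSketch {
  private HashMap<Long, Long> primary = new HashMap<>(); // node hash -> FST address
  private HashMap<Long, Long> fallback = new HashMap<>();
  private final int limit; // stand-in for the ramLimitMB-derived size limit

  TwoTableCacheSketch(int limit) {
    this.limit = limit;
  }

  /** Returns the cached FST address for this node hash, inserting freshAddress on a miss. */
  long lookupOrAdd(long nodeHash, long freshAddress) {
    Long addr = primary.get(nodeHash);
    if (addr != null) {
      return addr; // primary hit: suffix already frozen, reuse its address
    }
    addr = fallback.get(nodeHash);
    if (addr == null) {
      addr = freshAddress; // true miss: caller freezes the node at freshAddress
    }
    primary.put(nodeHash, addr); // promote fallback hits (and insert misses) into primary
    if (primary.size() >= limit) {
      // primary is full: demote it to fallback and start a fresh primary, so
      // entries unseen for two generations are discarded (the LRU-ish part)
      fallback = primary;
      primary = new HashMap<>();
    }
    return addr;
  }
}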
lucene/core/src/test/org/apache/lucene/util/fst/{TestNodeHash.java → TestFSTSuffixNodeCache.java}

@@ -19,14 +19,16 @@
 import com.carrotsearch.randomizedtesting.generators.RandomBytes;
 import org.apache.lucene.tests.util.LuceneTestCase;

-public class TestNodeHash extends LuceneTestCase {
+public class TestFSTSuffixNodeCache extends LuceneTestCase {

   public void testCopyFallbackNodeBytes() {
     // we don't need the FSTCompiler in this test
-    NodeHash<Object> nodeHash = new NodeHash<>(null, 1);
+    FSTSuffixNodeCache<Object> suffixCache = new FSTSuffixNodeCache<>(null, 1);

-    NodeHash<Object>.PagedGrowableHash primaryHashTable = nodeHash.new PagedGrowableHash();
-    NodeHash<Object>.PagedGrowableHash fallbackHashTable = nodeHash.new PagedGrowableHash();
+    FSTSuffixNodeCache<Object>.PagedGrowableHash primaryHashTable =
+        suffixCache.new PagedGrowableHash();
+    FSTSuffixNodeCache<Object>.PagedGrowableHash fallbackHashTable =
+        suffixCache.new PagedGrowableHash();
     int nodeLength = atLeast(500);
     long fallbackHashSlot = 1;
     byte[] fallbackBytes = RandomBytes.randomBytesOfLength(random(), nodeLength);