3131import java .io .IOException ;
3232import org .apache .lucene .store .ByteArrayDataOutput ;
3333import org .apache .lucene .store .DataOutput ;
34+ import org .apache .lucene .util .Accountable ;
3435import org .apache .lucene .util .ArrayUtil ;
3536import org .apache .lucene .util .IntsRef ;
3637import org .apache .lucene .util .IntsRefBuilder ;
38+ import org .apache .lucene .util .RamUsageEstimator ;
3739import org .apache .lucene .util .fst .FST .INPUT_TYPE ; // javadoc
3840
3941// TODO: could we somehow stream an FST to disk while we
@@ -83,7 +85,7 @@ public class FSTCompiler<T> {
8385 */
8486 private static final float DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR = 1.66f ;
8587
86- private final NodeHash <T > dedupHash ;
88+ final NodeHash <T > dedupHash ;
8789 final FST <T > fst ;
8890 private final T NO_OUTPUT ;
8991
@@ -145,7 +147,7 @@ private FSTCompiler(
145147 if (suffixRAMLimitMB < 0 ) {
146148 throw new IllegalArgumentException ("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB );
147149 } else if (suffixRAMLimitMB > 0 ) {
148- dedupHash = new NodeHash <>(this , suffixRAMLimitMB , bytes . getReverseReader ( false ) );
150+ dedupHash = new NodeHash <>(this , suffixRAMLimitMB );
149151 } else {
150152 dedupHash = null ;
151153 }
@@ -270,10 +272,6 @@ public float getDirectAddressingMaxOversizingFactor() {
270272 return directAddressingMaxOversizingFactor ;
271273 }
272274
273- public long getTermCount () {
274- return frontier [0 ].inputCount ;
275- }
276-
277275 public long getNodeCount () {
278276 // 1+ in order to count the -1 implicit final node
279277 return 1 + nodeCount ;
@@ -292,13 +290,13 @@ private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throw
292290 long bytesPosStart = bytes .getPosition ();
293291 if (dedupHash != null ) {
294292 if (nodeIn .numArcs == 0 ) {
295- node = addNode (nodeIn );
293+ node = addNode (nodeIn ). nodeAddress ;
296294 lastFrozenNode = node ;
297295 } else {
298296 node = dedupHash .add (nodeIn );
299297 }
300298 } else {
301- node = addNode (nodeIn );
299+ node = addNode (nodeIn ). nodeAddress ;
302300 }
303301 assert node != -2 ;
304302
@@ -318,13 +316,13 @@ private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throw
318316
319317 // serializes new node by appending its bytes to the end
320318 // of the current byte[]
321- long addNode (FSTCompiler .UnCompiledNode <T > nodeIn ) throws IOException {
319+ NodeAndBuffer addNode (FSTCompiler .UnCompiledNode <T > nodeIn ) throws IOException {
322320 // System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
323321 if (nodeIn .numArcs == 0 ) {
324322 if (nodeIn .isFinal ) {
325- return FINAL_END_NODE ;
323+ return new NodeAndBuffer ( FINAL_END_NODE , null ) ;
326324 } else {
327- return NON_FINAL_END_NODE ;
325+ return new NodeAndBuffer ( NON_FINAL_END_NODE , null ) ;
328326 }
329327 }
330328 final long startAddress = bytes .getPosition ();
@@ -461,7 +459,13 @@ long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
461459 final long thisNodeAddress = bytes .getPosition () - 1 ;
462460 bytes .reverse (startAddress , thisNodeAddress );
463461 nodeCount ++;
464- return thisNodeAddress ;
462+ byte [] buf = new byte [Math .toIntExact (thisNodeAddress - startAddress + 1 )];
463+ bytes .copyBytes (startAddress , buf , 0 , buf .length );
464+ return new NodeAndBuffer (thisNodeAddress , buf );
465+ }
466+
467+ record NodeAndBuffer (long nodeAddress , byte [] bytes ) {
468+
465469 }
466470
467471 private void writeLabel (DataOutput out , int v ) throws IOException {
@@ -749,7 +753,6 @@ public void add(IntsRef input, T output) throws IOException {
749753 // format cannot represent the empty input since
750754 // 'finalness' is stored on the incoming arc, not on
751755 // the node
752- frontier [0 ].inputCount ++;
753756 frontier [0 ].isFinal = true ;
754757 fst .setEmptyOutput (output );
755758 return ;
@@ -760,9 +763,6 @@ public void add(IntsRef input, T output) throws IOException {
760763 int pos2 = input .offset ;
761764 final int pos1Stop = Math .min (lastInput .length (), input .length );
762765 while (true ) {
763- frontier [pos1 ].inputCount ++;
764- // System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" +
765- // frontier[pos1]);
766766 if (pos1 >= pos1Stop || lastInput .intAt (pos1 ) != input .ints [pos2 ]) {
767767 break ;
768768 }
@@ -786,7 +786,6 @@ public void add(IntsRef input, T output) throws IOException {
786786 // init tail states for current input
787787 for (int idx = prefixLenPlus1 ; idx <= input .length ; idx ++) {
788788 frontier [idx - 1 ].addArc (input .ints [input .offset + idx - 1 ], frontier [idx ]);
789- frontier [idx ].inputCount ++;
790789 }
791790
792791 final UnCompiledNode <T > lastNode = frontier [input .length ];
@@ -835,8 +834,6 @@ public void add(IntsRef input, T output) throws IOException {
835834
836835 // save last input
837836 lastInput .copyInts (input );
838-
839- // System.out.println(" count[0]=" + frontier[0].inputCount);
840837 }
841838
842839 private boolean validOutput (T output ) {
@@ -866,6 +863,7 @@ public FST<T> compile() throws IOException {
866863
867864 /** Expert: holds a pending (seen but not yet serialized) arc. */
868865 static class Arc <T > {
866+
869867 int label ; // really an "unsigned" byte
870868 Node target ;
871869 boolean isFinal ;
@@ -895,7 +893,13 @@ public boolean isCompiled() {
895893 }
896894
897895 /** Expert: holds a pending (seen but not yet serialized) Node. */
898- static final class UnCompiledNode <T > implements Node {
896+ static final class UnCompiledNode <T > implements Node , Accountable {
897+
898+ private static final long BASE_RAM_BYTES_USED =
899+ RamUsageEstimator .shallowSizeOfInstance (UnCompiledNode .class );
900+ private static final long BASE_ARC_RAM_BYTES_USED =
901+ RamUsageEstimator .shallowSizeOfInstance (Arc .class );
902+
899903 final FSTCompiler <T > owner ;
900904 int numArcs ;
901905 Arc <T >[] arcs ;
@@ -906,10 +910,6 @@ static final class UnCompiledNode<T> implements Node {
906910 T output ;
907911 boolean isFinal ;
908912
909- // TODO: remove this tracking? we used to use it for confusingly pruning NodeHash, but
910- // we switched to LRU by RAM usage instead:
911- long inputCount ;
912-
913913 /** This node's depth, starting from the automaton root. */
914914 final int depth ;
915915
@@ -931,11 +931,79 @@ public boolean isCompiled() {
931931 return false ;
932932 }
933933
934+ @ Override
935+ public int hashCode () {
936+ final int PRIME = 31 ;
937+ long h = 0 ;
938+ // TODO: maybe if number of arcs is high we can safely subsample?
939+ for (int arcIdx = 0 ; arcIdx < numArcs ; arcIdx ++) {
940+ final FSTCompiler .Arc <T > arc = arcs [arcIdx ];
941+ h = PRIME * h + arc .label ;
942+ long n = ((FSTCompiler .CompiledNode ) arc .target ).node ;
943+ h = PRIME * h + (int ) (n ^ (n >> 32 ));
944+ h = PRIME * h + arc .output .hashCode ();
945+ h = PRIME * h + arc .nextFinalOutput .hashCode ();
946+ if (arc .isFinal ) {
947+ h += 17 ;
948+ }
949+ }
950+
951+ return Long .hashCode (h );
952+ }
953+
954+ @ Override
955+ public boolean equals (Object obj ) {
956+ if ((obj instanceof FSTCompiler .UnCompiledNode ) == false ) {
957+ return false ;
958+ }
959+ @ SuppressWarnings ("unchecked" )
960+ UnCompiledNode <T > other = (UnCompiledNode <T >) obj ;
961+
962+ if (numArcs != other .numArcs || isFinal != other .isFinal ) {
963+ return false ;
964+ }
965+
966+ for (int arcUpto = 0 ; arcUpto < numArcs ; arcUpto ++) {
967+ final FSTCompiler .Arc <T > arc = arcs [arcUpto ];
968+ final FSTCompiler .Arc <T > otherArc = other .arcs [arcUpto ];
969+ if (arc .label != otherArc .label
970+ || arc .output .equals (otherArc .output ) == false
971+ || ((FSTCompiler .CompiledNode ) arc .target ).node
972+ != ((FSTCompiler .CompiledNode ) otherArc .target ).node
973+ || arc .nextFinalOutput .equals (otherArc .nextFinalOutput ) == false
974+ || arc .isFinal != otherArc .isFinal ) {
975+ return false ;
976+ }
977+ }
978+
979+ return true ;
980+ }
981+
982+ @ Override
983+ public String toString () {
984+ String arcString = "" ;
985+ for (int arcIndex = 0 ; arcIndex < numArcs ; arcIndex ++) {
986+ Arc <T > arc = arcs [arcIndex ];
987+ arcString +=
988+ "{label="
989+ + arc .label
990+ + ", output="
991+ + arc .output
992+ + ", isFinal="
993+ + arc .isFinal
994+ + ", nextFinalOutput="
995+ + arc .nextFinalOutput
996+ + ", node="
997+ + (arc .target != null ? ((CompiledNode ) arc .target ).node : "" )
998+ + "}," ;
999+ }
1000+ return "{numArcs=" + numArcs + ", isFinal=" + isFinal + ", arcs=[" + arcString + "]}" ;
1001+ }
1002+
9341003 void clear () {
9351004 numArcs = 0 ;
9361005 isFinal = false ;
9371006 output = owner .NO_OUTPUT ;
938- inputCount = 0 ;
9391007
9401008 // We don't clear the depth here because it never changes
9411009 // for nodes on the frontier (even when reused).
@@ -1009,6 +1077,31 @@ void prependOutput(T outputPrefix) {
10091077 assert owner .validOutput (output );
10101078 }
10111079 }
1080+
    @Override
    public long ramBytesUsed() {
      // Shallow RAM estimate for this pending node: the node object itself plus
      // one Arc object per live arc. NOTE(review): this counts numArcs, not
      // arcs.length, and omits the arcs[] array header, so a pre-grown arcs
      // array is under-counted — confirm that is intentional for the NodeHash
      // RAM accounting this feeds.
      return BASE_RAM_BYTES_USED + BASE_ARC_RAM_BYTES_USED * numArcs;
    }
1085+
1086+ @ SuppressWarnings ({"rawtypes" , "unchecked" })
1087+ UnCompiledNode <T > copyToNewNode () {
1088+ FSTCompiler .UnCompiledNode <T > cloned = new FSTCompiler .UnCompiledNode <>(owner , depth );
1089+ cloned .numArcs = numArcs ;
1090+ cloned .output = output ;
1091+ cloned .isFinal = isFinal ;
1092+
1093+ cloned .arcs = new FSTCompiler .Arc [numArcs ];
1094+ for (int arcIndex = 0 ; arcIndex < numArcs ; arcIndex ++) {
1095+ FSTCompiler .Arc <T > nodeArc = arcs [arcIndex ];
1096+ cloned .arcs [arcIndex ] = new FSTCompiler .Arc <>();
1097+ cloned .arcs [arcIndex ].target = nodeArc .target ;
1098+ cloned .arcs [arcIndex ].label = nodeArc .label ;
1099+ cloned .arcs [arcIndex ].output = nodeArc .output ;
1100+ cloned .arcs [arcIndex ].isFinal = nodeArc .isFinal ;
1101+ cloned .arcs [arcIndex ].nextFinalOutput = nodeArc .nextFinalOutput ;
1102+ }
1103+ return cloned ;
1104+ }
10121105 }
10131106
10141107 /**
0 commit comments