Skip to content

Commit e178511

Browse files
Committed: "change to package-private"
1 parent: 12fc7bf · commit: e178511

File tree

4 files changed: +171 additions, −44 deletions

lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java

Lines changed: 118 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,11 @@
3131
import java.io.IOException;
3232
import org.apache.lucene.store.ByteArrayDataOutput;
3333
import org.apache.lucene.store.DataOutput;
34+
import org.apache.lucene.util.Accountable;
3435
import org.apache.lucene.util.ArrayUtil;
3536
import org.apache.lucene.util.IntsRef;
3637
import org.apache.lucene.util.IntsRefBuilder;
38+
import org.apache.lucene.util.RamUsageEstimator;
3739
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
3840

3941
// TODO: could we somehow stream an FST to disk while we
@@ -83,7 +85,7 @@ public class FSTCompiler<T> {
8385
*/
8486
private static final float DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR = 1.66f;
8587

86-
private final NodeHash<T> dedupHash;
88+
final NodeHash<T> dedupHash;
8789
final FST<T> fst;
8890
private final T NO_OUTPUT;
8991

@@ -145,7 +147,7 @@ private FSTCompiler(
145147
if (suffixRAMLimitMB < 0) {
146148
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
147149
} else if (suffixRAMLimitMB > 0) {
148-
dedupHash = new NodeHash<>(this, suffixRAMLimitMB, bytes.getReverseReader(false));
150+
dedupHash = new NodeHash<>(this, suffixRAMLimitMB);
149151
} else {
150152
dedupHash = null;
151153
}
@@ -270,10 +272,6 @@ public float getDirectAddressingMaxOversizingFactor() {
270272
return directAddressingMaxOversizingFactor;
271273
}
272274

273-
public long getTermCount() {
274-
return frontier[0].inputCount;
275-
}
276-
277275
public long getNodeCount() {
278276
// 1+ in order to count the -1 implicit final node
279277
return 1 + nodeCount;
@@ -292,13 +290,13 @@ private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throw
292290
long bytesPosStart = bytes.getPosition();
293291
if (dedupHash != null) {
294292
if (nodeIn.numArcs == 0) {
295-
node = addNode(nodeIn);
293+
node = addNode(nodeIn).nodeAddress;
296294
lastFrozenNode = node;
297295
} else {
298296
node = dedupHash.add(nodeIn);
299297
}
300298
} else {
301-
node = addNode(nodeIn);
299+
node = addNode(nodeIn).nodeAddress;
302300
}
303301
assert node != -2;
304302

@@ -318,13 +316,13 @@ private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throw
318316

319317
// serializes new node by appending its bytes to the end
320318
// of the current byte[]
321-
long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
319+
NodeAndBuffer addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
322320
// System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
323321
if (nodeIn.numArcs == 0) {
324322
if (nodeIn.isFinal) {
325-
return FINAL_END_NODE;
323+
return new NodeAndBuffer(FINAL_END_NODE, null);
326324
} else {
327-
return NON_FINAL_END_NODE;
325+
return new NodeAndBuffer(NON_FINAL_END_NODE, null);
328326
}
329327
}
330328
final long startAddress = bytes.getPosition();
@@ -461,7 +459,13 @@ long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
461459
final long thisNodeAddress = bytes.getPosition() - 1;
462460
bytes.reverse(startAddress, thisNodeAddress);
463461
nodeCount++;
464-
return thisNodeAddress;
462+
byte[] buf = new byte[Math.toIntExact(thisNodeAddress - startAddress + 1)];
463+
bytes.copyBytes(startAddress, buf, 0, buf.length);
464+
return new NodeAndBuffer(thisNodeAddress, buf);
465+
}
466+
467+
record NodeAndBuffer(long nodeAddress, byte[] bytes) {
468+
465469
}
466470

467471
private void writeLabel(DataOutput out, int v) throws IOException {
@@ -749,7 +753,6 @@ public void add(IntsRef input, T output) throws IOException {
749753
// format cannot represent the empty input since
750754
// 'finalness' is stored on the incoming arc, not on
751755
// the node
752-
frontier[0].inputCount++;
753756
frontier[0].isFinal = true;
754757
fst.setEmptyOutput(output);
755758
return;
@@ -760,9 +763,6 @@ public void add(IntsRef input, T output) throws IOException {
760763
int pos2 = input.offset;
761764
final int pos1Stop = Math.min(lastInput.length(), input.length);
762765
while (true) {
763-
frontier[pos1].inputCount++;
764-
// System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" +
765-
// frontier[pos1]);
766766
if (pos1 >= pos1Stop || lastInput.intAt(pos1) != input.ints[pos2]) {
767767
break;
768768
}
@@ -786,7 +786,6 @@ public void add(IntsRef input, T output) throws IOException {
786786
// init tail states for current input
787787
for (int idx = prefixLenPlus1; idx <= input.length; idx++) {
788788
frontier[idx - 1].addArc(input.ints[input.offset + idx - 1], frontier[idx]);
789-
frontier[idx].inputCount++;
790789
}
791790

792791
final UnCompiledNode<T> lastNode = frontier[input.length];
@@ -835,8 +834,6 @@ public void add(IntsRef input, T output) throws IOException {
835834

836835
// save last input
837836
lastInput.copyInts(input);
838-
839-
// System.out.println(" count[0]=" + frontier[0].inputCount);
840837
}
841838

842839
private boolean validOutput(T output) {
@@ -866,6 +863,7 @@ public FST<T> compile() throws IOException {
866863

867864
/** Expert: holds a pending (seen but not yet serialized) arc. */
868865
static class Arc<T> {
866+
869867
int label; // really an "unsigned" byte
870868
Node target;
871869
boolean isFinal;
@@ -895,7 +893,13 @@ public boolean isCompiled() {
895893
}
896894

897895
/** Expert: holds a pending (seen but not yet serialized) Node. */
898-
static final class UnCompiledNode<T> implements Node {
896+
static final class UnCompiledNode<T> implements Node, Accountable {
897+
898+
private static final long BASE_RAM_BYTES_USED =
899+
RamUsageEstimator.shallowSizeOfInstance(UnCompiledNode.class);
900+
private static final long BASE_ARC_RAM_BYTES_USED =
901+
RamUsageEstimator.shallowSizeOfInstance(Arc.class);
902+
899903
final FSTCompiler<T> owner;
900904
int numArcs;
901905
Arc<T>[] arcs;
@@ -906,10 +910,6 @@ static final class UnCompiledNode<T> implements Node {
906910
T output;
907911
boolean isFinal;
908912

909-
// TODO: remove this tracking? we used to use it for confusingly pruning NodeHash, but
910-
// we switched to LRU by RAM usage instead:
911-
long inputCount;
912-
913913
/** This node's depth, starting from the automaton root. */
914914
final int depth;
915915

@@ -931,11 +931,79 @@ public boolean isCompiled() {
931931
return false;
932932
}
933933

934+
@Override
935+
public int hashCode() {
936+
final int PRIME = 31;
937+
long h = 0;
938+
// TODO: maybe if number of arcs is high we can safely subsample?
939+
for (int arcIdx = 0; arcIdx < numArcs; arcIdx++) {
940+
final FSTCompiler.Arc<T> arc = arcs[arcIdx];
941+
h = PRIME * h + arc.label;
942+
long n = ((FSTCompiler.CompiledNode) arc.target).node;
943+
h = PRIME * h + (int) (n ^ (n >> 32));
944+
h = PRIME * h + arc.output.hashCode();
945+
h = PRIME * h + arc.nextFinalOutput.hashCode();
946+
if (arc.isFinal) {
947+
h += 17;
948+
}
949+
}
950+
951+
return Long.hashCode(h);
952+
}
953+
954+
@Override
955+
public boolean equals(Object obj) {
956+
if ((obj instanceof FSTCompiler.UnCompiledNode) == false) {
957+
return false;
958+
}
959+
@SuppressWarnings("unchecked")
960+
UnCompiledNode<T> other = (UnCompiledNode<T>) obj;
961+
962+
if (numArcs != other.numArcs || isFinal != other.isFinal) {
963+
return false;
964+
}
965+
966+
for (int arcUpto = 0; arcUpto < numArcs; arcUpto++) {
967+
final FSTCompiler.Arc<T> arc = arcs[arcUpto];
968+
final FSTCompiler.Arc<T> otherArc = other.arcs[arcUpto];
969+
if (arc.label != otherArc.label
970+
|| arc.output.equals(otherArc.output) == false
971+
|| ((FSTCompiler.CompiledNode) arc.target).node
972+
!= ((FSTCompiler.CompiledNode) otherArc.target).node
973+
|| arc.nextFinalOutput.equals(otherArc.nextFinalOutput) == false
974+
|| arc.isFinal != otherArc.isFinal) {
975+
return false;
976+
}
977+
}
978+
979+
return true;
980+
}
981+
982+
@Override
983+
public String toString() {
984+
String arcString = "";
985+
for (int arcIndex = 0; arcIndex < numArcs; arcIndex++) {
986+
Arc<T> arc = arcs[arcIndex];
987+
arcString +=
988+
"{label="
989+
+ arc.label
990+
+ ", output="
991+
+ arc.output
992+
+ ", isFinal="
993+
+ arc.isFinal
994+
+ ", nextFinalOutput="
995+
+ arc.nextFinalOutput
996+
+ ", node="
997+
+ (arc.target != null ? ((CompiledNode) arc.target).node : "")
998+
+ "},";
999+
}
1000+
return "{numArcs=" + numArcs + ", isFinal=" + isFinal + ", arcs=[" + arcString + "]}";
1001+
}
1002+
9341003
void clear() {
9351004
numArcs = 0;
9361005
isFinal = false;
9371006
output = owner.NO_OUTPUT;
938-
inputCount = 0;
9391007

9401008
// We don't clear the depth here because it never changes
9411009
// for nodes on the frontier (even when reused).
@@ -1009,6 +1077,31 @@ void prependOutput(T outputPrefix) {
10091077
assert owner.validOutput(output);
10101078
}
10111079
}
1080+
1081+
@Override
1082+
public long ramBytesUsed() {
1083+
return BASE_RAM_BYTES_USED + BASE_ARC_RAM_BYTES_USED * numArcs;
1084+
}
1085+
1086+
@SuppressWarnings({"rawtypes", "unchecked"})
1087+
UnCompiledNode<T> copyToNewNode() {
1088+
FSTCompiler.UnCompiledNode<T> cloned = new FSTCompiler.UnCompiledNode<>(owner, depth);
1089+
cloned.numArcs = numArcs;
1090+
cloned.output = output;
1091+
cloned.isFinal = isFinal;
1092+
1093+
cloned.arcs = new FSTCompiler.Arc[numArcs];
1094+
for (int arcIndex = 0; arcIndex < numArcs; arcIndex++) {
1095+
FSTCompiler.Arc<T> nodeArc = arcs[arcIndex];
1096+
cloned.arcs[arcIndex] = new FSTCompiler.Arc<>();
1097+
cloned.arcs[arcIndex].target = nodeArc.target;
1098+
cloned.arcs[arcIndex].label = nodeArc.label;
1099+
cloned.arcs[arcIndex].output = nodeArc.output;
1100+
cloned.arcs[arcIndex].isFinal = nodeArc.isFinal;
1101+
cloned.arcs[arcIndex].nextFinalOutput = nodeArc.nextFinalOutput;
1102+
}
1103+
return cloned;
1104+
}
10121105
}
10131106

10141107
/**

0 commit comments

Comments (0)