Skip to content

Commit bb479c7

Browse files
dungba88 authored and
mikemccand committed
Streamline FST constructors and make it fully read-only (#12758)
* Streamline FST constructors * Let init return the FSTStore * Change constructor visibility
1 parent 9cc2eae commit bb479c7

File tree

7 files changed

+139
-76
lines changed

7 files changed

+139
-76
lines changed

lucene/core/src/java/org/apache/lucene/util/fst/FST.java

Lines changed: 96 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@
6161
*/
6262
public final class FST<T> implements Accountable {
6363

64+
final FSTMetadata<T> metadata;
65+
6466
/** Specifies allowed range of each int input label for this FST. */
6567
public enum INPUT_TYPE {
6668
BYTE1,
@@ -100,7 +102,7 @@ public enum INPUT_TYPE {
100102
private static final String FILE_FORMAT_NAME = "FST";
101103
private static final int VERSION_START = 6;
102104
private static final int VERSION_LITTLE_ENDIAN = 8;
103-
private static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN;
105+
static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN;
104106

105107
// Never serialized; just used to represent the virtual
106108
// final node w/ no arcs:
@@ -113,24 +115,14 @@ public enum INPUT_TYPE {
113115
/** If arc has this label then that arc is final/accepted */
114116
public static final int END_LABEL = -1;
115117

116-
final INPUT_TYPE inputType;
117-
118-
// if non-null, this FST accepts the empty string and
119-
// produces this output
120-
T emptyOutput;
121-
122118
/**
123119
* A {@link BytesStore}, used during building, or during reading when the FST is very large (more
124120
* than 1 GB). If the FST is less than 1 GB then bytesArray is set instead.
125121
*/
126122
private final FSTReader fstReader;
127123

128-
private long startNode = -1;
129-
130124
public final Outputs<T> outputs;
131125

132-
private final int version;
133-
134126
/** Represents a single arc. */
135127
public static final class Arc<T> {
136128

@@ -395,33 +387,58 @@ private static boolean flag(int flags, int bit) {
395387
return (flags & bit) != 0;
396388
}
397389

398-
// make a new empty FST, for building; Builder invokes this
399-
FST(INPUT_TYPE inputType, Outputs<T> outputs, FSTReader fstReader) {
400-
this.inputType = inputType;
401-
this.outputs = outputs;
402-
emptyOutput = null;
403-
this.fstReader = fstReader;
404-
this.version = VERSION_CURRENT;
405-
}
406-
407390
private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
408391

409-
/** Load a previously saved FST. */
392+
/**
393+
* Load a previously saved FST with a DataInput for metadata using an {@link OnHeapFSTStore} with
394+
* maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS}
395+
*/
410396
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs) throws IOException {
411397
this(metaIn, in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
412398
}
413399

414400
/**
415-
* Load a previously saved FST; maxBlockBits allows you to control the size of the byte[] pages
416-
* used to hold the FST bytes.
401+
* Load a previously saved FST with a DataInput for metadata and a FSTStore. If using {@link
402+
* OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
403+
* to hold the FST bytes.
417404
*/
418405
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore)
419406
throws IOException {
407+
this(readMetadata(metaIn, outputs), in, outputs, fstStore);
408+
}
409+
410+
/**
411+
* Load a previously saved FST with a metadata object and a FSTStore. If using {@link
412+
* OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
413+
* to hold the FST bytes.
414+
*/
415+
public FST(FSTMetadata<T> metadata, DataInput in, Outputs<T> outputs, FSTStore fstStore)
416+
throws IOException {
417+
this(metadata, outputs, fstStore.init(in, metadata.numBytes));
418+
}
419+
420+
/** Create the FST with a metadata object and a FSTReader. */
421+
FST(FSTMetadata<T> metadata, Outputs<T> outputs, FSTReader fstReader) {
422+
this.metadata = metadata;
420423
this.outputs = outputs;
424+
this.fstReader = fstReader;
425+
}
421426

427+
/**
428+
* Read the FST metadata from DataInput
429+
*
430+
* @param metaIn the DataInput of the metadata
431+
* @param outputs the FST outputs
432+
* @return the FST metadata
433+
* @param <T> the output type
434+
* @throws IOException if exception occurred during parsing
435+
*/
436+
public static <T> FSTMetadata<T> readMetadata(DataInput metaIn, Outputs<T> outputs)
437+
throws IOException {
422438
// NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
423439
// back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it
424-
this.version = CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
440+
int version = CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
441+
T emptyOutput;
425442
if (metaIn.readByte() == 1) {
426443
// accepts empty string
427444
// 1 KB blocks:
@@ -441,6 +458,7 @@ public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore
441458
} else {
442459
emptyOutput = null;
443460
}
461+
INPUT_TYPE inputType;
444462
final byte t = metaIn.readByte();
445463
switch (t) {
446464
case 0:
@@ -453,13 +471,11 @@ public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore
453471
inputType = INPUT_TYPE.BYTE4;
454472
break;
455473
default:
456-
throw new CorruptIndexException("invalid input type " + t, in);
474+
throw new CorruptIndexException("invalid input type " + t, metaIn);
457475
}
458-
startNode = metaIn.readVLong();
459-
476+
long startNode = metaIn.readVLong();
460477
long numBytes = metaIn.readVLong();
461-
fstStore.init(in, numBytes);
462-
this.fstReader = fstStore;
478+
return new FSTMetadata<>(inputType, emptyOutput, startNode, version, numBytes);
463479
}
464480

465481
@Override
@@ -469,50 +485,42 @@ public long ramBytesUsed() {
469485

470486
@Override
471487
public String toString() {
472-
return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs;
473-
}
474-
475-
void finish(long newStartNode) throws IOException {
476-
assert newStartNode <= fstReader.size();
477-
if (startNode != -1) {
478-
throw new IllegalStateException("already finished");
479-
}
480-
if (newStartNode == FINAL_END_NODE && emptyOutput != null) {
481-
newStartNode = 0;
482-
}
483-
startNode = newStartNode;
488+
return getClass().getSimpleName() + "(input=" + metadata.inputType + ",output=" + outputs;
484489
}
485490

486491
public long numBytes() {
487-
return fstReader.size();
492+
return metadata.numBytes;
488493
}
489494

490495
public T getEmptyOutput() {
491-
return emptyOutput;
496+
return metadata.emptyOutput;
492497
}
493498

494-
void setEmptyOutput(T v) {
495-
if (emptyOutput != null) {
496-
emptyOutput = outputs.merge(emptyOutput, v);
497-
} else {
498-
emptyOutput = v;
499-
}
499+
public FSTMetadata<T> getMetadata() {
500+
return metadata;
500501
}
501502

502503
public void save(DataOutput metaOut, DataOutput out) throws IOException {
503-
if (startNode == -1) {
504-
throw new IllegalStateException("call finish first");
505-
}
504+
saveMetadata(metaOut);
505+
fstReader.writeTo(out);
506+
}
507+
508+
/**
509+
* Save the metadata to a DataOutput
510+
*
511+
* @param metaOut the DataOutput to save
512+
*/
513+
public void saveMetadata(DataOutput metaOut) throws IOException {
506514
CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT);
507515
// TODO: really we should encode this as an arc, arriving
508516
// to the root node, instead of special casing here:
509-
if (emptyOutput != null) {
517+
if (metadata.emptyOutput != null) {
510518
// Accepts empty string
511519
metaOut.writeByte((byte) 1);
512520

513521
// Serialize empty-string output:
514522
ByteBuffersDataOutput ros = new ByteBuffersDataOutput();
515-
outputs.writeFinalOutput(emptyOutput, ros);
523+
outputs.writeFinalOutput(metadata.emptyOutput, ros);
516524
byte[] emptyOutputBytes = ros.toArrayCopy();
517525
int emptyLen = emptyOutputBytes.length;
518526

@@ -531,17 +539,16 @@ public void save(DataOutput metaOut, DataOutput out) throws IOException {
531539
metaOut.writeByte((byte) 0);
532540
}
533541
final byte t;
534-
if (inputType == INPUT_TYPE.BYTE1) {
542+
if (metadata.inputType == INPUT_TYPE.BYTE1) {
535543
t = 0;
536-
} else if (inputType == INPUT_TYPE.BYTE2) {
544+
} else if (metadata.inputType == INPUT_TYPE.BYTE2) {
537545
t = 1;
538546
} else {
539547
t = 2;
540548
}
541549
metaOut.writeByte(t);
542-
metaOut.writeVLong(startNode);
550+
metaOut.writeVLong(metadata.startNode);
543551
metaOut.writeVLong(numBytes());
544-
fstReader.writeTo(out);
545552
}
546553

547554
/** Writes an automaton to a file. */
@@ -563,12 +570,12 @@ public static <T> FST<T> read(Path path, Outputs<T> outputs) throws IOException
563570
/** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */
564571
public int readLabel(DataInput in) throws IOException {
565572
final int v;
566-
if (inputType == INPUT_TYPE.BYTE1) {
573+
if (metadata.inputType == INPUT_TYPE.BYTE1) {
567574
// Unsigned byte:
568575
v = in.readByte() & 0xFF;
569-
} else if (inputType == INPUT_TYPE.BYTE2) {
576+
} else if (metadata.inputType == INPUT_TYPE.BYTE2) {
570577
// Unsigned short:
571-
if (version < VERSION_LITTLE_ENDIAN) {
578+
if (metadata.version < VERSION_LITTLE_ENDIAN) {
572579
v = Short.reverseBytes(in.readShort()) & 0xFFFF;
573580
} else {
574581
v = in.readShort() & 0xFFFF;
@@ -608,10 +615,10 @@ private void readPresenceBytes(Arc<T> arc, BytesReader in) throws IOException {
608615
public Arc<T> getFirstArc(Arc<T> arc) {
609616
T NO_OUTPUT = outputs.getNoOutput();
610617

611-
if (emptyOutput != null) {
618+
if (metadata.emptyOutput != null) {
612619
arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC;
613-
arc.nextFinalOutput = emptyOutput;
614-
if (emptyOutput != NO_OUTPUT) {
620+
arc.nextFinalOutput = metadata.emptyOutput;
621+
if (metadata.emptyOutput != NO_OUTPUT) {
615622
arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT);
616623
}
617624
} else {
@@ -622,7 +629,7 @@ public Arc<T> getFirstArc(Arc<T> arc) {
622629

623630
// If there are no nodes, ie, the FST only accepts the
624631
// empty string, then startNode is 0
625-
arc.target = startNode;
632+
arc.target = metadata.startNode;
626633
return arc;
627634
}
628635

@@ -1132,4 +1139,28 @@ public abstract static class BytesReader extends DataInput {
11321139
/** Returns true if this reader uses reversed bytes under-the-hood. */
11331140
public abstract boolean reversed();
11341141
}
1142+
1143+
/**
1144+
* Represents the FST metadata
1145+
*
1146+
* @param <T> the FST output type
1147+
*/
1148+
public static final class FSTMetadata<T> {
1149+
final INPUT_TYPE inputType;
1150+
final int version;
1151+
// if non-null, this FST accepts the empty string and
1152+
// produces this output
1153+
T emptyOutput;
1154+
long startNode;
1155+
long numBytes;
1156+
1157+
public FSTMetadata(
1158+
INPUT_TYPE inputType, T emptyOutput, long startNode, int version, long numBytes) {
1159+
this.inputType = inputType;
1160+
this.emptyOutput = emptyOutput;
1161+
this.startNode = startNode;
1162+
this.version = version;
1163+
this.numBytes = numBytes;
1164+
}
1165+
}
11351166
}

lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import static org.apache.lucene.util.fst.FST.BIT_TARGET_NEXT;
2727
import static org.apache.lucene.util.fst.FST.FINAL_END_NODE;
2828
import static org.apache.lucene.util.fst.FST.NON_FINAL_END_NODE;
29+
import static org.apache.lucene.util.fst.FST.VERSION_CURRENT;
2930
import static org.apache.lucene.util.fst.FST.getNumPresenceBytes;
3031

3132
import java.io.IOException;
@@ -141,7 +142,7 @@ private FSTCompiler(
141142
// pad: ensure no node gets address 0 which is reserved to mean
142143
// the stop state w/ no arcs
143144
bytes.writeByte((byte) 0);
144-
fst = new FST<>(inputType, outputs, bytes);
145+
fst = new FST<>(new FST.FSTMetadata<>(inputType, null, -1, VERSION_CURRENT, 0), outputs, bytes);
145146
if (suffixRAMLimitMB < 0) {
146147
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
147148
} else if (suffixRAMLimitMB > 0) {
@@ -462,10 +463,10 @@ long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
462463

463464
private void writeLabel(DataOutput out, int v) throws IOException {
464465
assert v >= 0 : "v=" + v;
465-
if (fst.inputType == INPUT_TYPE.BYTE1) {
466+
if (fst.metadata.inputType == INPUT_TYPE.BYTE1) {
466467
assert v <= 255 : "v=" + v;
467468
out.writeByte((byte) v);
468-
} else if (fst.inputType == INPUT_TYPE.BYTE2) {
469+
} else if (fst.metadata.inputType == INPUT_TYPE.BYTE2) {
469470
assert v <= 65535 : "v=" + v;
470471
out.writeShort((short) v);
471472
} else {
@@ -746,7 +747,7 @@ public void add(IntsRef input, T output) throws IOException {
746747
// 'finalness' is stored on the incoming arc, not on
747748
// the node
748749
frontier[0].isFinal = true;
749-
fst.setEmptyOutput(output);
750+
setEmptyOutput(output);
750751
return;
751752
}
752753

@@ -828,6 +829,26 @@ public void add(IntsRef input, T output) throws IOException {
828829
lastInput.copyInts(input);
829830
}
830831

832+
void setEmptyOutput(T v) {
833+
if (fst.metadata.emptyOutput != null) {
834+
fst.metadata.emptyOutput = fst.outputs.merge(fst.metadata.emptyOutput, v);
835+
} else {
836+
fst.metadata.emptyOutput = v;
837+
}
838+
}
839+
840+
void finish(long newStartNode) {
841+
assert newStartNode <= bytes.size();
842+
if (fst.metadata.startNode != -1) {
843+
throw new IllegalStateException("already finished");
844+
}
845+
if (newStartNode == FINAL_END_NODE && fst.metadata.emptyOutput != null) {
846+
newStartNode = 0;
847+
}
848+
fst.metadata.startNode = newStartNode;
849+
fst.metadata.numBytes = bytes.getPosition();
850+
}
851+
831852
private boolean validOutput(T output) {
832853
return output == NO_OUTPUT || !output.equals(NO_OUTPUT);
833854
}
@@ -840,14 +861,14 @@ public FST<T> compile() throws IOException {
840861
// minimize nodes in the last word's suffix
841862
freezeTail(0);
842863
if (root.numArcs == 0) {
843-
if (fst.emptyOutput == null) {
864+
if (fst.metadata.emptyOutput == null) {
844865
return null;
845866
}
846867
}
847868

848869
// if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + "
849870
// root.output=" + root.output);
850-
fst.finish(compileNode(root, lastInput.length()).node);
871+
finish(compileNode(root, lastInput.length()).node);
851872
bytes.finish();
852873

853874
return fst;

lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,14 @@
2121

2222
/** A type of {@link FSTReader} which needs data to be initialized before use */
2323
public interface FSTStore extends FSTReader {
24-
void init(DataInput in, long numBytes) throws IOException;
24+
25+
/**
26+
* Initialize the FSTStore
27+
*
28+
* @param in the DataInput to read from
29+
* @param numBytes the number of bytes to read
30+
* @return this FSTStore
31+
* @throws IOException if exception occurred during reading the DataInput
32+
*/
33+
FSTStore init(DataInput in, long numBytes) throws IOException;
2534
}

0 commit comments

Comments
 (0)