6161 */
6262public final class FST <T > implements Accountable {
6363
64+ final FSTMetadata <T > metadata ;
65+
6466 /** Specifies allowed range of each int input label for this FST. */
6567 public enum INPUT_TYPE {
6668 BYTE1 ,
@@ -100,7 +102,7 @@ public enum INPUT_TYPE {
100102 private static final String FILE_FORMAT_NAME = "FST" ;
101103 private static final int VERSION_START = 6 ;
102104 private static final int VERSION_LITTLE_ENDIAN = 8 ;
103- private static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN ;
105+ static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN ;
104106
105107 // Never serialized; just used to represent the virtual
106108 // final node w/ no arcs:
@@ -113,24 +115,14 @@ public enum INPUT_TYPE {
113115 /** If arc has this label then that arc is final/accepted */
114116 public static final int END_LABEL = -1 ;
115117
116- final INPUT_TYPE inputType ;
117-
118- // if non-null, this FST accepts the empty string and
119- // produces this output
120- T emptyOutput ;
121-
122118 /**
123119 * A {@link BytesStore}, used during building, or during reading when the FST is very large (more
124120 * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead.
125121 */
126122 private final FSTReader fstReader ;
127123
128- private long startNode = -1 ;
129-
130124 public final Outputs <T > outputs ;
131125
132- private final int version ;
133-
134126 /** Represents a single arc. */
135127 public static final class Arc <T > {
136128
@@ -395,33 +387,58 @@ private static boolean flag(int flags, int bit) {
395387 return (flags & bit ) != 0 ;
396388 }
397389
398- // make a new empty FST, for building; Builder invokes this
399- FST (INPUT_TYPE inputType , Outputs <T > outputs , FSTReader fstReader ) {
400- this .inputType = inputType ;
401- this .outputs = outputs ;
402- emptyOutput = null ;
403- this .fstReader = fstReader ;
404- this .version = VERSION_CURRENT ;
405- }
406-
407390 private static final int DEFAULT_MAX_BLOCK_BITS = Constants .JRE_IS_64BIT ? 30 : 28 ;
408391
409- /** Load a previously saved FST. */
392+ /**
393+ * Load a previously saved FST with a DataInput for metadata using an {@link OnHeapFSTStore} with
394+ * maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS}
395+ */
410396 public FST (DataInput metaIn , DataInput in , Outputs <T > outputs ) throws IOException {
411397 this (metaIn , in , outputs , new OnHeapFSTStore (DEFAULT_MAX_BLOCK_BITS ));
412398 }
413399
414400 /**
415- * Load a previously saved FST; maxBlockBits allows you to control the size of the byte[] pages
416- * used to hold the FST bytes.
401+ * Load a previously saved FST with a DataInput for metadata and a FSTStore. If using {@link
402+ * OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
403+ * to hold the FST bytes.
417404 */
418405 public FST (DataInput metaIn , DataInput in , Outputs <T > outputs , FSTStore fstStore )
419406 throws IOException {
407+ this (readMetadata (metaIn , outputs ), in , outputs , fstStore );
408+ }
409+
410+ /**
411+ * Load a previously saved FST with a metadata object and a FSTStore. If using {@link
412+ * OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
413+ * to hold the FST bytes.
414+ */
415+ public FST (FSTMetadata <T > metadata , DataInput in , Outputs <T > outputs , FSTStore fstStore )
416+ throws IOException {
417+ this (metadata , outputs , fstStore .init (in , metadata .numBytes ));
418+ }
419+
420+ /** Create the FST with a metadata object and a FSTReader. */
421+ FST (FSTMetadata <T > metadata , Outputs <T > outputs , FSTReader fstReader ) {
422+ this .metadata = metadata ;
420423 this .outputs = outputs ;
424+ this .fstReader = fstReader ;
425+ }
421426
427+ /**
428+ * Read the FST metadata from DataInput
429+ *
430+ * @param metaIn the DataInput of the metadata
431+ * @param outputs the FST outputs
432+ * @return the FST metadata
433+ * @param <T> the output type
434+ * @throws IOException if exception occurred during parsing
435+ */
436+ public static <T > FSTMetadata <T > readMetadata (DataInput metaIn , Outputs <T > outputs )
437+ throws IOException {
422438 // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
423439 // back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it
424- this .version = CodecUtil .checkHeader (metaIn , FILE_FORMAT_NAME , VERSION_START , VERSION_CURRENT );
440+ int version = CodecUtil .checkHeader (metaIn , FILE_FORMAT_NAME , VERSION_START , VERSION_CURRENT );
441+ T emptyOutput ;
425442 if (metaIn .readByte () == 1 ) {
426443 // accepts empty string
427444 // 1 KB blocks:
@@ -441,6 +458,7 @@ public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore
441458 } else {
442459 emptyOutput = null ;
443460 }
461+ INPUT_TYPE inputType ;
444462 final byte t = metaIn .readByte ();
445463 switch (t ) {
446464 case 0 :
@@ -453,13 +471,11 @@ public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore
453471 inputType = INPUT_TYPE .BYTE4 ;
454472 break ;
455473 default :
456- throw new CorruptIndexException ("invalid input type " + t , in );
474+ throw new CorruptIndexException ("invalid input type " + t , metaIn );
457475 }
458- startNode = metaIn .readVLong ();
459-
476+ long startNode = metaIn .readVLong ();
460477 long numBytes = metaIn .readVLong ();
461- fstStore .init (in , numBytes );
462- this .fstReader = fstStore ;
478+ return new FSTMetadata <>(inputType , emptyOutput , startNode , version , numBytes );
463479 }
464480
465481 @ Override
@@ -469,50 +485,42 @@ public long ramBytesUsed() {
469485
470486 @ Override
471487 public String toString () {
472- return getClass ().getSimpleName () + "(input=" + inputType + ",output=" + outputs ;
473- }
474-
475- void finish (long newStartNode ) throws IOException {
476- assert newStartNode <= fstReader .size ();
477- if (startNode != -1 ) {
478- throw new IllegalStateException ("already finished" );
479- }
480- if (newStartNode == FINAL_END_NODE && emptyOutput != null ) {
481- newStartNode = 0 ;
482- }
483- startNode = newStartNode ;
488+ return getClass ().getSimpleName () + "(input=" + metadata .inputType + ",output=" + outputs ;
484489 }
485490
486491 public long numBytes () {
487- return fstReader . size () ;
492+ return metadata . numBytes ;
488493 }
489494
490495 public T getEmptyOutput () {
491- return emptyOutput ;
496+ return metadata . emptyOutput ;
492497 }
493498
494- void setEmptyOutput (T v ) {
495- if (emptyOutput != null ) {
496- emptyOutput = outputs .merge (emptyOutput , v );
497- } else {
498- emptyOutput = v ;
499- }
499+ public FSTMetadata <T > getMetadata () {
500+ return metadata ;
500501 }
501502
502503 public void save (DataOutput metaOut , DataOutput out ) throws IOException {
503- if (startNode == -1 ) {
504- throw new IllegalStateException ("call finish first" );
505- }
504+ saveMetadata (metaOut );
505+ fstReader .writeTo (out );
506+ }
507+
508+ /**
509+ * Save the metadata to a DataOutput
510+ *
511+ * @param metaOut the DataOutput to save
512+ */
513+ public void saveMetadata (DataOutput metaOut ) throws IOException {
506514 CodecUtil .writeHeader (metaOut , FILE_FORMAT_NAME , VERSION_CURRENT );
507515 // TODO: really we should encode this as an arc, arriving
508516 // to the root node, instead of special casing here:
509- if (emptyOutput != null ) {
517+ if (metadata . emptyOutput != null ) {
510518 // Accepts empty string
511519 metaOut .writeByte ((byte ) 1 );
512520
513521 // Serialize empty-string output:
514522 ByteBuffersDataOutput ros = new ByteBuffersDataOutput ();
515- outputs .writeFinalOutput (emptyOutput , ros );
523+ outputs .writeFinalOutput (metadata . emptyOutput , ros );
516524 byte [] emptyOutputBytes = ros .toArrayCopy ();
517525 int emptyLen = emptyOutputBytes .length ;
518526
@@ -531,17 +539,16 @@ public void save(DataOutput metaOut, DataOutput out) throws IOException {
531539 metaOut .writeByte ((byte ) 0 );
532540 }
533541 final byte t ;
534- if (inputType == INPUT_TYPE .BYTE1 ) {
542+ if (metadata . inputType == INPUT_TYPE .BYTE1 ) {
535543 t = 0 ;
536- } else if (inputType == INPUT_TYPE .BYTE2 ) {
544+ } else if (metadata . inputType == INPUT_TYPE .BYTE2 ) {
537545 t = 1 ;
538546 } else {
539547 t = 2 ;
540548 }
541549 metaOut .writeByte (t );
542- metaOut .writeVLong (startNode );
550+ metaOut .writeVLong (metadata . startNode );
543551 metaOut .writeVLong (numBytes ());
544- fstReader .writeTo (out );
545552 }
546553
547554 /** Writes an automaton to a file. */
@@ -563,12 +570,12 @@ public static <T> FST<T> read(Path path, Outputs<T> outputs) throws IOException
563570 /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */
564571 public int readLabel (DataInput in ) throws IOException {
565572 final int v ;
566- if (inputType == INPUT_TYPE .BYTE1 ) {
573+ if (metadata . inputType == INPUT_TYPE .BYTE1 ) {
567574 // Unsigned byte:
568575 v = in .readByte () & 0xFF ;
569- } else if (inputType == INPUT_TYPE .BYTE2 ) {
576+ } else if (metadata . inputType == INPUT_TYPE .BYTE2 ) {
570577 // Unsigned short:
571- if (version < VERSION_LITTLE_ENDIAN ) {
578+ if (metadata . version < VERSION_LITTLE_ENDIAN ) {
572579 v = Short .reverseBytes (in .readShort ()) & 0xFFFF ;
573580 } else {
574581 v = in .readShort () & 0xFFFF ;
@@ -608,10 +615,10 @@ private void readPresenceBytes(Arc<T> arc, BytesReader in) throws IOException {
608615 public Arc <T > getFirstArc (Arc <T > arc ) {
609616 T NO_OUTPUT = outputs .getNoOutput ();
610617
611- if (emptyOutput != null ) {
618+ if (metadata . emptyOutput != null ) {
612619 arc .flags = BIT_FINAL_ARC | BIT_LAST_ARC ;
613- arc .nextFinalOutput = emptyOutput ;
614- if (emptyOutput != NO_OUTPUT ) {
620+ arc .nextFinalOutput = metadata . emptyOutput ;
621+ if (metadata . emptyOutput != NO_OUTPUT ) {
615622 arc .flags = (byte ) (arc .flags () | BIT_ARC_HAS_FINAL_OUTPUT );
616623 }
617624 } else {
@@ -622,7 +629,7 @@ public Arc<T> getFirstArc(Arc<T> arc) {
622629
623630 // If there are no nodes, ie, the FST only accepts the
624631 // empty string, then startNode is 0
625- arc .target = startNode ;
632+ arc .target = metadata . startNode ;
626633 return arc ;
627634 }
628635
@@ -1132,4 +1139,28 @@ public abstract static class BytesReader extends DataInput {
11321139 /** Returns true if this reader uses reversed bytes under-the-hood. */
11331140 public abstract boolean reversed ();
11341141 }
1142+
1143+ /**
1144+ * Represent the FST metadata
1145+ *
1146+ * @param <T> the FST output type
1147+ */
1148+ public static final class FSTMetadata <T > {
1149+ final INPUT_TYPE inputType ;
1150+ final int version ;
1151+ // if non-null, this FST accepts the empty string and
1152+ // produces this output
1153+ T emptyOutput ;
1154+ long startNode ;
1155+ long numBytes ;
1156+
1157+ public FSTMetadata (
1158+ INPUT_TYPE inputType , T emptyOutput , long startNode , int version , long numBytes ) {
1159+ this .inputType = inputType ;
1160+ this .emptyOutput = emptyOutput ;
1161+ this .startNode = startNode ;
1162+ this .version = version ;
1163+ this .numBytes = numBytes ;
1164+ }
1165+ }
11351166}
0 commit comments