Skip to content

Commit ebf9e29

Browse files
authored
Ensure Nori/Kuromoji shipped binary FST is the latest version (#12933)
* ensure Nori/Kuromoji shipped binary FST is the latest version (closes #12911)
* fold feedback from @uschindler: sharpen test failure messages to give the specific gradlew command to regenerate the precise FST (not everything)
* add javadoc for FSTMetadata.getVersion
1 parent 3965319 commit ebf9e29

File tree

5 files changed

+62
-3
lines changed

5 files changed

+62
-3
lines changed

lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ private TokenInfoDictionary(
111111
this.fst = new TokenInfoFST(fst, true);
112112
}
113113

114-
private static InputStream getClassResource(String suffix) throws IOException {
114+
static InputStream getClassResource(String suffix) throws IOException {
115115
final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
116116
return IOUtils.requireResourceNonNull(
117117
TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);

lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,22 @@
2121
import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
2222
import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
2323

24+
import java.io.BufferedInputStream;
25+
import java.io.InputStream;
2426
import java.io.OutputStream;
2527
import java.io.OutputStreamWriter;
2628
import java.io.PrintWriter;
2729
import java.nio.charset.StandardCharsets;
2830
import java.nio.file.Files;
2931
import java.nio.file.Path;
32+
import org.apache.lucene.store.InputStreamDataInput;
3033
import org.apache.lucene.tests.util.LuceneTestCase;
3134
import org.apache.lucene.util.IntsRef;
3235
import org.apache.lucene.util.IntsRefBuilder;
3336
import org.apache.lucene.util.UnicodeUtil;
3437
import org.apache.lucene.util.fst.FST;
3538
import org.apache.lucene.util.fst.IntsRefFSTEnum;
39+
import org.apache.lucene.util.fst.PositiveIntOutputs;
3640

3741
/** Tests of TokenInfoDictionary build tools; run using ant test-tools */
3842
public class TestTokenInfoDictionary extends LuceneTestCase {
@@ -178,4 +182,25 @@ public void testEnumerateAll() throws Exception {
178182
System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
179183
}
180184
}
185+
186+
// #12911: make sure our shipped binary FST for TokenInfoDictionary is the latest & greatest
187+
// format
188+
public void testBinaryFSTIsLatestFormat() throws Exception {
189+
try (InputStream is =
190+
new BufferedInputStream(
191+
TokenInfoDictionary.getClassResource(TokenInfoDictionary.FST_FILENAME_SUFFIX))) {
192+
// we only need to load the FSTMetadata to check version:
193+
int actualVersion =
194+
FST.readMetadata(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton())
195+
.getVersion();
196+
assertEquals(
197+
"TokenInfoDictionary's FST is not the latest version: expected "
198+
+ FST.VERSION_CURRENT
199+
+ " but got: "
200+
+ actualVersion
201+
+ "; run \"./gradlew :lucene:analysis:kuromoji:regenerate\" to regenerate this FST",
202+
FST.VERSION_CURRENT,
203+
actualVersion);
204+
}
205+
}
181206
}

lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ private TokenInfoDictionary(
109109
this.fst = new TokenInfoFST(fst);
110110
}
111111

112-
private static InputStream getClassResource(String suffix) throws IOException {
112+
static InputStream getClassResource(String suffix) throws IOException {
113113
final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
114114
return IOUtils.requireResourceNonNull(
115115
TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);

lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,23 @@
2121
import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
2222
import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
2323

24+
import java.io.BufferedInputStream;
25+
import java.io.InputStream;
2426
import java.io.OutputStream;
2527
import java.io.OutputStreamWriter;
2628
import java.io.PrintWriter;
2729
import java.nio.charset.StandardCharsets;
2830
import java.nio.file.Files;
2931
import java.nio.file.Path;
3032
import org.apache.lucene.analysis.ko.POS;
33+
import org.apache.lucene.store.InputStreamDataInput;
3134
import org.apache.lucene.tests.util.LuceneTestCase;
3235
import org.apache.lucene.util.IntsRef;
3336
import org.apache.lucene.util.IntsRefBuilder;
3437
import org.apache.lucene.util.UnicodeUtil;
3538
import org.apache.lucene.util.fst.FST;
3639
import org.apache.lucene.util.fst.IntsRefFSTEnum;
40+
import org.apache.lucene.util.fst.PositiveIntOutputs;
3741

3842
/** Tests of TokenInfoDictionary build tools; run using ant test-tools */
3943
public class TestTokenInfoDictionary extends LuceneTestCase {
@@ -185,4 +189,25 @@ public void testEnumerateAll() throws Exception {
185189
System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
186190
}
187191
}
192+
193+
// #12911: make sure our shipped binary FST for TokenInfoDictionary is the latest & greatest
194+
// format
195+
public void testBinaryFSTIsLatestFormat() throws Exception {
196+
try (InputStream is =
197+
new BufferedInputStream(
198+
TokenInfoDictionary.getClassResource(TokenInfoDictionary.FST_FILENAME_SUFFIX))) {
199+
// we only need to load the FSTMetadata to check version:
200+
int actualVersion =
201+
FST.readMetadata(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton())
202+
.getVersion();
203+
assertEquals(
204+
"TokenInfoDictionary's FST is not the latest version: expected "
205+
+ FST.VERSION_CURRENT
206+
+ " but got: "
207+
+ actualVersion
208+
+ "; run \"./gradlew :lucene:analysis:nori:regenerate\" to regenerate this FST",
209+
FST.VERSION_CURRENT,
210+
actualVersion);
211+
}
212+
}
188213
}

lucene/core/src/java/org/apache/lucene/util/fst/FST.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1208,7 +1208,7 @@ public abstract static class BytesReader extends DataInput {
12081208
}
12091209

12101210
/**
1211-
* Represent the FST metadata
1211+
* Represents the FST metadata.
12121212
*
12131213
* @param <T> the FST output type
12141214
*/
@@ -1236,5 +1236,14 @@ public FSTMetadata(
12361236
this.version = version;
12371237
this.numBytes = numBytes;
12381238
}
1239+
1240+
/**
1241+
* Returns the version constant of the binary format this FST was written in. See the {@code
1242+
* static final int VERSION} constants in FST's javadoc, e.g. {@link
1243+
* FST#VERSION_CONTINUOUS_ARCS}.
1244+
*/
1245+
public int getVersion() {
1246+
return version;
1247+
}
12391248
}
12401249
}

0 commit comments

Comments
 (0)