OPENNLP-1753: Switch to pre-trained Model binaries v1.3 (OpenNLP 2.x) (#810)

mawiesne · web-flow · commit adc38faf1191 · 2025-07-07T19:05:35.000+02:00
- provides and adapts PR #799 for the OpenNLP 2.x maintenance branch
- adjusts version strings
- adjusts the index.html template to the latest v1.3 copy for DownloadParserTest
- adjusts examples in the Dev manual
diff --git a/opennlp-docs/src/docbkx/lemmatizer.xml b/opennlp-docs/src/docbkx/lemmatizer.xml
@@ -41,7 +41,7 @@
 			<para>
 				<screen>
 		   <![CDATA[
-$ opennlp LemmatizerME opennlp-en-ud-ewt-lemmas-1.2-2.5.0.bin < sentences]]>
+$ opennlp LemmatizerME opennlp-en-ud-ewt-lemmas-1.3-2.5.4.bin < sentences]]>
 		  </screen>
 				The Lemmatizer now reads a pos tagged sentence(s) per line from
 				standard input. For example, you can copy this sentence to the
@@ -89,7 +89,7 @@ signed	VERB	sign
 				<programlisting language="java">
 		<![CDATA[
 LemmatizerModel model = null;
-try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-lemmas-1.2-2.5.0.bin"))) {
+try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-lemmas-1.3-2.5.4.bin")) {
   model = new LemmatizerModel(modelIn);
 }
 ]]>
diff --git a/opennlp-docs/src/docbkx/postagger.xml b/opennlp-docs/src/docbkx/postagger.xml
@@ -41,7 +41,7 @@ under the License.
 		Download the English maxent pos model and start the POS Tagger Tool with this command:
 		<screen>
 			<![CDATA[
-$ opennlp POSTagger opennlp-en-ud-ewt-pos-1.2-2.5.0.bin]]>
+$ opennlp POSTagger opennlp-en-ud-ewt-pos-1.3-2.5.4.bin]]>
 		 </screen>
 		The POS Tagger now reads a tokenized sentence per line from stdin.
 		Copy these two sentences to the console:
@@ -69,7 +69,7 @@ Mr._PROPN Vinken_PROPN is_AUX chairman_NOUN of_ADP Elsevier_ADJ N.V._PROPN ,_PUN
 			In the sample below it is loaded from disk.
 			<programlisting language="java">
 				<![CDATA[
-try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-pos-1.2-2.5.0.bin"){
+try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-pos-1.3-2.5.4.bin")) {
   POSModel model = new POSModel(modelIn);
 }]]>
 			</programlisting>
@@ -343,7 +343,7 @@ Arrays.stream(tags).forEach(k -> System.out.print(k + " "));]]>
 				    POS Tags using the custom model (en-custom-pos-maxent.bin): PROPN OTHER PRON ADJ NOUN PUNCT
 
 				Output with the default model
-				    POS Tags using the default model (opennlp-en-ud-ewt-pos-1.2-2.5.0.bin):	NOUN AUX PRON ADJ NOUN PUNCT
+				    POS Tags using the default model (opennlp-en-ud-ewt-pos-1.3-2.5.4.bin):	NOUN AUX PRON ADJ NOUN PUNCT
 			</literallayout>
 		</para>
 		</section>
diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
-				"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
-				]>
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+]>
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
@@ -57,31 +57,31 @@ Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC,
 		</para>
 
 		<section id="tools.sentdetect.detection.cmdline">
-			<title>Sentence Detection Tool</title>
-			<para>
-				The easiest way to try out the Sentence Detector is the command line tool. The tool is only intended for demonstration and testing.
-				Download the english sentence detector model and start the Sentence Detector Tool with this command:
-				<screen>
-					<![CDATA[
-$ opennlp SentenceDetector opennlp-en-ud-ewt-sentence-1.2-2.5.0.bin]]>
-				</screen>
-				Just copy the sample text from above to the console. The Sentence Detector will read it and echo one sentence per line to the console.
-				Usually the input is read from a file and the output is redirected to another file. This can be achieved with the following command.
-				<screen>
-					<![CDATA[
-$ opennlp SentenceDetector opennlp-en-ud-ewt-sentence-1.2-2.5.0.bin < input.txt > output.txt]]>
-				</screen>
-				For the english sentence model from the website the input text should not be tokenized.
-			</para>
+		  <title>Sentence Detection Tool</title>
+		  <para>
+		    The easiest way to try out the Sentence Detector is the command line tool. The tool is only intended for demonstration and testing.
+		    Download the English sentence detector model and start the Sentence Detector Tool with this command:
+        <screen>
+          <![CDATA[
+$ opennlp SentenceDetector opennlp-en-ud-ewt-sentence-1.3-2.5.4.bin]]>
+		    </screen>
+		    Just copy the sample text from above to the console. The Sentence Detector will read it and echo one sentence per line to the console.
+		    Usually the input is read from a file and the output is redirected to another file. This can be achieved with the following command.
+		    <screen>
+				  <![CDATA[
+$ opennlp SentenceDetector opennlp-en-ud-ewt-sentence-1.3-2.5.4.bin < input.txt > output.txt]]>
+		    </screen>
+		    For the English sentence model from the website the input text should not be tokenized.
+		  </para>
 		</section>
 		<section id="tools.sentdetect.detection.api">
-			<title>Sentence Detection API</title>
-			<para>
-				The Sentence Detector can be easily integrated into an application via its API.
-				To instantiate the Sentence Detector the sentence model must be loaded first.
-				<programlisting language="java">
-					<![CDATA[
-try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-sentence-1.2-2.5.0.bin")) {
+		  <title>Sentence Detection API</title>
+		  <para>
+		    The Sentence Detector can be easily integrated into an application via its API.
+		    To instantiate the Sentence Detector the sentence model must be loaded first.
+		    <programlisting language="java">
+				<![CDATA[
+try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-sentence-1.3-2.5.4.bin")) {
   SentenceModel model = new SentenceModel(modelIn);
 }]]>
 				</programlisting>
diff --git a/opennlp-docs/src/docbkx/tokenizer.xml b/opennlp-docs/src/docbkx/tokenizer.xml
@@ -97,7 +97,7 @@ $ opennlp SimpleTokenizer]]>
 			our website.
 			<screen>
 			<![CDATA[
-$ opennlp TokenizerME opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin]]>
+$ opennlp TokenizerME opennlp-en-ud-ewt-tokens-1.3-2.5.4.bin]]>
 		    </screen>
 			To test the tokenizer copy the sample from above to the console. The
 			whitespace separated tokens will be written back to the
@@ -107,7 +107,7 @@ $ opennlp TokenizerME opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin]]>
 			Usually the input is read from a file and written to a file.
 			<screen>
 			<![CDATA[
-$ opennlp TokenizerME opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin < article.txt > article-tokenized.txt]]>
+$ opennlp TokenizerME opennlp-en-ud-ewt-tokens-1.3-2.5.4.bin < article.txt > article-tokenized.txt]]>
 		    </screen>
 			It can be done in the same way for the Simple Tokenizer.
 		</para>
@@ -151,7 +151,7 @@ London share prices were bolstered largely by continued gains on Wall Street and
 			can be loaded.
 			<programlisting language="java">
 			<![CDATA[
-try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin")) {
+try (InputStream modelIn = new FileInputStream("opennlp-en-ud-ewt-tokens-1.3-2.5.4.bin")) {
   TokenizerModel model = new TokenizerModel(modelIn);
 }]]>
 		 </programlisting>
diff --git a/opennlp-tools/src/main/java/opennlp/tools/monitoring/StopCriteria.java b/opennlp-tools/src/main/java/opennlp/tools/monitoring/StopCriteria.java
@@ -19,18 +19,14 @@
 
 import java.util.function.Predicate;
 
-import opennlp.tools.ml.model.AbstractModel;
-
-
 /**
  * Stop criteria for model training. If the predicate is met, then the training is aborted.
  *
  * @see Predicate
- * @see AbstractModel
  */
 public interface StopCriteria<T extends Number> extends Predicate<T> {
 
-  String FINISHED = "Training Finished after completing %s Iterations successfully.";
+  String FINISHED = "Training finished after completing %s iterations successfully.";
 
   /**
    * @return A detailed message captured upon hitting the {@link StopCriteria} during model training.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -56,9 +56,9 @@ public class DownloadUtil {
   private static final Logger logger = LoggerFactory.getLogger(DownloadUtil.class);
 
   private static final String BASE_URL =
-      System.getProperty("OPENNLP_DOWNLOAD_BASE_URL", "https://dlcdn.apache.org/opennlp/");
+          System.getProperty("OPENNLP_DOWNLOAD_BASE_URL", "https://dlcdn.apache.org/opennlp/");
   private static final String MODEL_URI_PATH =
-      System.getProperty("OPENNLP_DOWNLOAD_MODEL_PATH", "models/ud-models-1.2/");
+          System.getProperty("OPENNLP_DOWNLOAD_MODEL_PATH", "models/ud-models-1.3/");
   private static final String OPENNLP_DOWNLOAD_HOME = "OPENNLP_DOWNLOAD_HOME";
 
   private static Map<String, Map<ModelType, String>> availableModels;
@@ -202,7 +202,7 @@ private static void validateModel(URL sha512, Path downloadedModel) throws IOExc
     final String actualChecksum = calculateSHA512(downloadedModel);
     if (!actualChecksum.equalsIgnoreCase(expectedChecksum)) {
       throw new IOException("SHA512 checksum validation failed for " + downloadedModel.getFileName() +
-          ". Expected: " + expectedChecksum + ", but got: " + actualChecksum);
+              ". Expected: " + expectedChecksum + ", but got: " + actualChecksum);
     }
   }
 
@@ -353,7 +353,7 @@ private void addModel(String locale, String link, Map<String, Map<ModelType, Str
     private String fetchPageIndex() {
       final StringBuilder html = new StringBuilder();
       try (BufferedReader br = new BufferedReader(
-          new InputStreamReader(indexUrl.openStream(), StandardCharsets.UTF_8))) {
+              new InputStreamReader(indexUrl.openStream(), StandardCharsets.UTF_8))) {
         String line;
         while ((line = br.readLine()) != null) {
           html.append(line);
diff --git a/opennlp-tools/src/test/java/opennlp/tools/AbstractModelLoaderTest.java b/opennlp-tools/src/test/java/opennlp/tools/AbstractModelLoaderTest.java
@@ -38,7 +38,7 @@ public abstract class AbstractModelLoaderTest {
   private static final String BASE_URL_MODELS_V183 = "https://dlcdn.apache.org/opennlp/models/langdetect/1.8.3/";
   protected static final Path OPENNLP_DIR = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
           System.getProperty("user.home"))).resolve(".opennlp");
-  protected static final String VER = "1.2-2.5.0";
+  protected static final String VER = "1.3-2.5.4";
   protected static final String BIN = ".bin";
   protected static List<String> SUPPORTED_LANG_CODES = List.of(
           "en", "fr", "de", "it", "nl", "bg", "ca", "cs", "da", "el",
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/lemmatizer/LemmatizerModelLoaderIT.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/lemmatizer/LemmatizerModelLoaderIT.java
@@ -56,7 +56,7 @@ public void setup() {
 
   @ParameterizedTest(name = "Verify \"{0}\" tokenizer model loading")
   @ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
-      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
+      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdtc", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
       "eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
       "ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
       "sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/postag/POSModelLoaderIT.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/postag/POSModelLoaderIT.java
@@ -56,7 +56,7 @@ public void setup() {
 
   @ParameterizedTest(name = "Verify \"{0}\" POS model loading")
   @ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
-      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
+      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdtc", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
       "eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
       "ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
       "sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/sentdetect/SentenceModelLoaderIT.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/sentdetect/SentenceModelLoaderIT.java
@@ -56,7 +56,7 @@ public void setup() {
 
   @ParameterizedTest(name = "Verify \"{0}\" sentence model loading")
   @ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
-      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
+      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdtc", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
       "eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
       "ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
       "sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerModelLoaderIT.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerModelLoaderIT.java
@@ -56,7 +56,7 @@ public void setup() {
 
   @ParameterizedTest(name = "Verify \"{0}\" tokenizer model loading")
   @ValueSource(strings = {"en-ud-ewt", "fr-ud-gsd", "de-ud-gsd", "it-ud-vit", "nl-ud-alpino",
-      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdt", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
+      "bg-ud-btb", "ca-ud-ancora", "cs-ud-pdtc", "da-ud-ddt", "el-ud-gdt", "es-ud-gsd", "et-ud-edt",
       "eu-ud-bdt", "fi-ud-tdt", "hr-ud-set", "hy-ud-bsut", "is-ud-icepahc", "ka-ud-glc", "kk-ud-ktb",
       "ko-ud-kaist", "lv-ud-lvtb", "no-ud-bokmaal", "pl-ud-pdb", "pt-ud-gsd", "ro-ud-rrt", "ru-ud-gsd",
       "sr-ud-set", "sk-ud-snk", "sl-ud-ssj", "sv-ud-talbanken", "tr-ud-boun", "uk-ud-iu"})
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactoryTest.java
@@ -49,7 +49,7 @@ public class TwentyNewsgroupSampleStreamFactoryTest extends
 
   private static final Path OPENNLP_DIR = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
           System.getProperty("user.home"))).resolve(".opennlp");
-  private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin";
+  private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.3-2.5.4.bin";
 
   // SUT
   private TwentyNewsgroupSampleStreamFactory factory;
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamFactoryTest.java
@@ -52,7 +52,7 @@ public class BratNameSampleStreamFactoryTest extends
 
   private static final Path OPENNLP_DIR = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
           System.getProperty("user.home"))).resolve(".opennlp");
-  private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin";
+  private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.3-2.5.4.bin";
 
   // SUT
   private BratNameSampleStreamFactory factory;
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactoryTest.java
@@ -48,7 +48,7 @@ public class Muc6NameSampleStreamFactoryTest extends
 
   private static final Path OPENNLP_DIR = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
           System.getProperty("user.home"))).resolve(".opennlp");
-  private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.2-2.5.0.bin";
+  private static final String TOKENIZER_MODEL_NAME = "opennlp-en-ud-ewt-tokens-1.3-2.5.4.bin";
 
   // SUT
   private Muc6NameSampleStreamFactory factory;
diff --git a/opennlp-tools/src/test/java/opennlp/tools/monitoring/DefaultTrainingProgressMonitorTest.java b/opennlp-tools/src/test/java/opennlp/tools/monitoring/DefaultTrainingProgressMonitorTest.java
@@ -79,7 +79,7 @@ void testFinishedTrainingWithoutStopCriteria() {
 
       //Assert that the logs captured the training completion message when all iterations are exhausted.
       List<String> actual = logCaptor.getInfoLogs();
-      List<String> expected = List.of("Training Finished after completing 150 Iterations successfully.");
+      List<String> expected = List.of("Training finished after completing 150 iterations successfully.");
       assertArrayEquals(expected.toArray(), actual.toArray());
     }
   }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadParserTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadParserTest.java
@@ -84,7 +84,7 @@ private URL fromClasspath(String file) {
   private static final String MODEL_SENT = "sentence-";
   private static final String MODEL_TOK = "tokens-";
   private static final String MODEL_POS = "pos-";
-  private static final String VER = "1.2-2.5.0";
+  private static final String VER = "1.3-2.5.4";
   private static final String BIN = ".bin";
 
   // Note: This needs to be public as JUnit 5 requires it like this.
@@ -116,9 +116,9 @@ public static Stream<Arguments> expectedModels() {
         ModelType.TOKENIZER, OPENNLP + "bg-ud-btb-" + MODEL_TOK + VER + BIN,
         ModelType.POS, OPENNLP + "bg-ud-btb-" + MODEL_POS + VER + BIN)),
       Arguments.of("cs", Map.of(
-        ModelType.SENTENCE_DETECTOR, OPENNLP + "cs-ud-pdt-" + MODEL_SENT + VER + BIN,
-        ModelType.TOKENIZER, OPENNLP + "cs-ud-pdt-" + MODEL_TOK + VER + BIN,
-        ModelType.POS, OPENNLP + "cs-ud-pdt-" + MODEL_POS + VER + BIN)),
+        ModelType.SENTENCE_DETECTOR, OPENNLP + "cs-ud-pdtc-" + MODEL_SENT + VER + BIN,
+        ModelType.TOKENIZER, OPENNLP + "cs-ud-pdtc-" + MODEL_TOK + VER + BIN,
+        ModelType.POS, OPENNLP + "cs-ud-pdtc-" + MODEL_POS + VER + BIN)),
       Arguments.of("da", Map.of(
         ModelType.SENTENCE_DETECTOR, OPENNLP + "da-ud-ddt-" + MODEL_SENT + VER + BIN,
         ModelType.TOKENIZER, OPENNLP + "da-ud-ddt-" + MODEL_TOK + VER + BIN,
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/util/index.html b/opennlp-tools/src/test/resources/opennlp/tools/util/index.html
diff --git a/opennlp-uima/src/test/java/opennlp/uima/util/OpennlpUtilTest.java b/opennlp-uima/src/test/java/opennlp/uima/util/OpennlpUtilTest.java
diff --git a/opennlp-uima/src/test/resources/test-descriptors/OpenNlpTextAnalyzer.xml b/opennlp-uima/src/test/resources/test-descriptors/OpenNlpTextAnalyzer.xml
diff --git a/opennlp-uima/src/test/resources/test-descriptors/PosTagger.xml b/opennlp-uima/src/test/resources/test-descriptors/PosTagger.xml
diff --git a/opennlp-uima/src/test/resources/test-descriptors/SentenceDetector.xml b/opennlp-uima/src/test/resources/test-descriptors/SentenceDetector.xml
diff --git a/opennlp-uima/src/test/resources/test-descriptors/Tokenizer.xml b/opennlp-uima/src/test/resources/test-descriptors/Tokenizer.xml
diff --git a/pom.xml b/pom.xml