OPENNLP-1745: SentenceDetector - Add JUnit test for useTokenEnd = false (#792)

NishantShri4 · web-flow · commit eab70aa07772 · 2025-06-24T20:58:59.000+02:00
* OPENNLP-1745: SentenceDetector - Add JUnit test for useTokenEnd = false * Added useTokenEnd to the list of optional params available for the sentence detector tool.
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
@@ -92,7 +92,7 @@ public void run(String format, String[] args) {
     try {
       Dictionary dict = loadDict(params.getAbbDict());
       SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
-          params.getFactory(), params.getLang(), true, dict, eos);
+          params.getFactory(), params.getLang(), params.getUseTokenEnd(), dict, eos);
       model = SentenceDetectorME.train(params.getLang(), sampleStream,
           sdFactory, mlParams);
     } catch (IOException e) {
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
@@ -44,4 +44,9 @@ interface TrainingParams extends BasicTrainingParams {
       description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.")
   @OptionalParameter
   String getFactory();
+
+  @ParameterDescription(valueName = "useTokenEnd",
+      description = "A boolean parameter to detect the start index of the next sentence in the test data.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getUseTokenEnd();
 }
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -20,12 +20,16 @@
 import java.io.IOException;
 import java.util.Locale;
 
-import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import opennlp.tools.dictionary.Dictionary;
 
+import static org.junit.jupiter.api.Assertions.assertAll;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.fail;
+
 /**
  * Tests for the {@link SentenceDetectorME} class.
  * <p>
@@ -42,64 +46,99 @@
 public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
 
   private static final char[] EOS_CHARS = {'.', '?', '!'};
-  
-  private static SentenceModel sentdetectModel;
+  private static Dictionary abbreviationDict;
+  private SentenceModel sentdetectModel;
 
   @BeforeAll
-  public static void prepareResources() throws IOException {
-    Dictionary abbreviationDict = loadAbbDictionary(Locale.GERMAN);
-    SentenceDetectorFactory factory = new SentenceDetectorFactory(
-            "deu", true, abbreviationDict, EOS_CHARS);
-    sentdetectModel = train(factory, Locale.GERMAN);
-    Assertions.assertNotNull(sentdetectModel);
-    Assertions.assertEquals("deu", sentdetectModel.getLanguage());
+  static void loadResources() throws IOException {
+    abbreviationDict = loadAbbDictionary(Locale.GERMAN);
+  }
+
+  private void prepareResources(boolean useTokenEnd) {
+    try {
+      SentenceDetectorFactory factory = new SentenceDetectorFactory(
+          "deu", useTokenEnd, abbreviationDict, EOS_CHARS);
+      sentdetectModel = train(factory, Locale.GERMAN);
+
+      assertAll(() -> assertNotNull(sentdetectModel),
+          () -> assertEquals("deu", sentdetectModel.getLanguage()));
+    } catch (IOException ex) {
+      fail("Couldn't train the SentenceModel using test data. Exception: " + ex.getMessage());
+    }
   }
 
   // Example taken from 'Sentences_DE.txt'
   @Test
   void testSentDetectWithInlineAbbreviationsEx1() {
+    prepareResources(true);
+
     final String sent1 = "Ein Traum, zu dessen Bildung eine besonders starke Verdichtung beigetragen, " +
-            "wird für diese Untersuchung das günstigste Material sein.";
+        "wird für diese Untersuchung das günstigste Material sein.";
     // Here we have two abbreviations "S. = Seite" and "ff. = folgende (Plural)"
     final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
 
     SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     String sampleSentences = sent1 + " " + sent2;
     String[] sents = sentDetect.sentDetect(sampleSentences);
-    Assertions.assertEquals(2, sents.length);
-    Assertions.assertEquals(sent1, sents[0]);
-    Assertions.assertEquals(sent2, sents[1]);
     double[] probs = sentDetect.getSentenceProbabilities();
-    Assertions.assertEquals(2, probs.length);
+
+    assertAll(() -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
   }
 
   // Reduced example taken from 'Sentences_DE.txt'
   @Test
   void testSentDetectWithInlineAbbreviationsEx2() {
+    prepareResources(true);
+
     // Here we have three abbreviations: "S. = Seite", "vgl. = vergleiche", and "f. = folgende (Singular)"
     final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
-            "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
+        "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
 
     SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     String[] sents = sentDetect.sentDetect(sent1);
-    Assertions.assertEquals(1, sents.length);
-    Assertions.assertEquals(sent1, sents[0]);
     double[] probs = sentDetect.getSentenceProbabilities();
-    Assertions.assertEquals(1, probs.length);
+
+    assertAll(() -> assertEquals(1, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(1, probs.length));
   }
 
   // Modified example deduced from 'Sentences_DE.txt'
   @Test
   void testSentDetectWithInlineAbbreviationsEx3() {
+    prepareResources(true);
+
     // Here we have two abbreviations "z. B. = zum Beispiel" and "S. = Seite"
     final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
-            "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
+        "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
 
     SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     String[] sents = sentDetect.sentDetect(sent1);
-    Assertions.assertEquals(1, sents.length);
-    Assertions.assertEquals(sent1, sents[0]);
     double[] probs = sentDetect.getSentenceProbabilities();
-    Assertions.assertEquals(1, probs.length);
+
+    assertAll(() -> assertEquals(1, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(1, probs.length));
+  }
+
+  @Test
+  void testSentDetectWithUseTokenEndFalse() {
+    prepareResources(false);
+
+    final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+    final String sent2 = "Verschiedene Gedanken sind während der Traumformation aktiv.";
+
+    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
+    // There is no blank space before the start of the second sentence.
+    String[] sents = sentDetect.sentDetect(sent1 + sent2);
+    double[] probs = sentDetect.getSentenceProbabilities();
+
+    assertAll(() -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
   }
 }
diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml
@@ -142,7 +142,10 @@ Arguments description:
         -data sampleData
                 data to be used, usually a file name.
         -encoding charsetName
-                encoding for reading and writing text, if absent the system default is used.]]>
+                encoding for reading and writing text, if absent the system default is used.
+        -useTokenEnd boolean flag
+                set to false when sentences in the data are not separated by a blank space, i.e. the next
+                sentence starts directly after the end of the previous one. If absent, it defaults to true.]]>
 	</screen>
 		To train an English sentence detector use the following command:
         <screen>