OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false

rzo1 · rzo1 · commit 56c5bea8b3a0 · 2026-03-22T13:14:50.000+01:00
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -204,7 +204,13 @@ public Span[] sentPosDetect(CharSequence s) {
       // skip over the leading parts of non-token final delimiters
       int fws = getFirstWS(s,cint + 1);
       if (i + 1 < end && enders.get(i + 1) < fws) {
-        continue;
+        // Do not skip if the character right after the delimiter is uppercase,
+        // as this likely indicates the start of a new sentence (e.g., "Gedanken.Bek.")
+        // rather than a multi-period abbreviation (e.g., "z.B.").
+        int nextCharIdx = cint + 1;
+        if (nextCharIdx >= s.length() || !Character.isUpperCase(s.charAt(nextCharIdx))) {
+          continue;
+        }
       }
       if (positions.size() > 0 && cint < positions.get(positions.size() - 1)) continue;
 
@@ -351,11 +357,11 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate
         if (tokenStartPos > candidateIndex) {
           break; // past candidate position, no point searching further
         }
-        if (tokenStartPos == 0
+        if (tokenStartPos == fromIndex
             && searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
-          return false; // full abbreviation match at sentence start -> no acceptable break
+          return false; // full abbreviation match at segment start -> no acceptable break
         }
-        final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
+        final char prevChar = s.charAt(tokenStartPos == fromIndex ? tokenStartPos : tokenStartPos - 1);
         if (tokenStartPos + tokenLength >= candidateIndex
           /*
            * Note:
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -203,6 +203,26 @@ void testSentDetectWithDuplicateAbbreviationInSameSegment() {
         () -> assertEquals(1, probs.length));
   }
 
+  /**
+   * Edge case (OPENNLP-1811): a multi-letter abbreviation ({@code "Bek."}) opens a non-first
+   * sentence that directly follows the previous one, with {@code useTokenEnd = false}.
+   */
+  @Test
+  void testSentDetectWithMultiLetterAbbreviationAtNonFirstSentenceStart() {
+    prepareResources(false);
+    final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+    final String sent2 = "Bek. Problem: Schlafmangel.";
+    // No space between the sentences (useTokenEnd=false supports this); "Bek." must stay in sent2
+    String sampleSentences = sent1 + sent2;
+    String[] sents = sentenceDetector.sentDetect(sampleSentences);
+    double[] probs = sentenceDetector.probs();
+    assertAll(
+        () -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works correctly,