OPENNLP-1810: Fix SentenceDetector failing to detect multiple identical abbreviations in the same sentence (#984)

rzo1 · mawiesne · commit db4f2de1457f · 2026-03-22T15:41:35.000+01:00
(cherry picked from commit dfe0a5a)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -340,29 +340,34 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate
       return true;
 
     final String text = s.toString();
+    final boolean caseSensitive = abbDict.isCaseSensitive();
+    final String searchText = caseSensitive ? text : StringUtil.toLowerCase(text);
     for (StringList abb : abbDict) {
-      final String abbToken = abb.getToken(0);
-      final int tokenStartPos = text.indexOf(abbToken, fromIndex);
-      if (tokenStartPos == -1) {
-        continue; // skip fast when abb is not present in text
-      }
-      if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
-        return false; // full abbreviation match at sentence start -> no acceptable break
-      } else {
-        final int tokenLength = abbToken.length();
+      final String abbToken = caseSensitive ? abb.getToken(0)
+          : StringUtil.toLowerCase(abb.getToken(0));
+      final int tokenLength = abbToken.length();
+      int tokenStartPos = searchText.indexOf(abbToken, fromIndex);
+      while (tokenStartPos != -1) {
+        if (tokenStartPos > candidateIndex) {
+          break; // past candidate position, no point searching further
+        }
+        if (tokenStartPos == 0
+            && searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
+          return false; // full abbreviation match at sentence start -> no acceptable break
+        }
         final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
-        if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex ||
+        if (tokenStartPos + tokenLength >= candidateIndex
           /*
            * Note:
            * Skip abbreviation candidate if regular characters exist directly before it,
            * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket.
            * This prevents mismatches from overlaps close to an actual sentence end.
            */
-            !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
-
-          continue;
+            && (Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
+          return false; // in case of a valid abbreviation: the (sentence) break is not accepted
         }
-        return false; // in case of a valid abbreviation: the (sentence) break is not accepted
+        // Try next occurrence of this abbreviation in the text
+        tokenStartPos = searchText.indexOf(abbToken, tokenStartPos + 1);
       }
     }
     return true; // no abbreviation(s) at given positions: valid sentence boundary
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -190,6 +190,19 @@ void testSentDetectWithMultiLetterAbbreviationsAtSentenceStart() {
         () -> assertEquals(2, probs.length));
   }
 
+  // Edge case: The same abbreviation appears twice in a single sentence segment.
+  @Test
+  void testSentDetectWithDuplicateAbbreviationInSameSegment() {
+    prepareResources(true);
+    final String sent1 = "Lt. Vertrag und lt. Bescheid gelten andere Bedingungen.";
+    String[] sents = sentenceDetector.sentDetect(sent1);
+    double[] probs = sentenceDetector.probs();
+    assertAll(
+        () -> assertEquals(1, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(1, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works correctly,