OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)

mawiesne · mawiesne · commit 5b690d73f49d · 2026-03-22T12:44:31.000+01:00
- adds reproducer & test - fixes the issue in SentenceDetectorME#isAcceptableBreak(..) - refactors some code in other spots (cherry picked from commit d780617)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java b/opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java
@@ -226,7 +226,7 @@ public void startPrefixMapping(String prefix, String uri)
   public static boolean create(InputStream in, EntryInserter inserter)
       throws IOException {
 
-    DictionaryContenthandler profileContentHandler = new DictionaryContenthandler(inserter);
+    DictionaryContenthandler handler = new DictionaryContenthandler(inserter);
 
     XMLReader xmlReader;
     try {
@@ -235,14 +235,14 @@ public static boolean create(InputStream in, EntryInserter inserter)
       // There is a compatibility problem here: JAXP default is false while SAX 2 default is true!
       // OpenNLP requires it activated!
       xmlReader.setFeature(SAX_FEATURE_NAMESPACES, true);
-      xmlReader.setContentHandler(profileContentHandler);
+      xmlReader.setContentHandler(handler);
       xmlReader.parse(new InputSource(new UncloseableInputStream(in)));
     }
     catch (ParserConfigurationException | SAXException e) {
       throw new InvalidFormatException("The profile data stream has " +
           "an invalid format!", e);
     }
-    return profileContentHandler.mIsCaseSensitiveDictionary;
+    return handler.mIsCaseSensitiveDictionary;
   }
 
   /**
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -339,27 +339,31 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate
     if (abbDict == null)
       return true;
 
+    final String text = s.toString();
     for (StringList abb : abbDict) {
-      final String token = abb.getToken(0);
-      final int tokenPosition = s.toString().indexOf(token, fromIndex);
-      if (tokenPosition == -1) {
-        continue; // skip fast
+      final String abbToken = abb.getToken(0);
+      final int tokenStartPos = text.indexOf(abbToken, fromIndex);
+      if (tokenStartPos == -1) {
+        continue; // skip fast when abb is not present in text
       }
-
-      final char prevChar = s.charAt(tokenPosition == 0 ? tokenPosition : tokenPosition - 1);
-      int tokenLength = token.length();
-      if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex ||
-        /*
-         * Note:
-         * Skip abbreviation candidate if regular characters exist directly before it,
-         * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket.
-         * This prevents mismatches from overlaps close to an actual sentence end.
-         */
-          !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
-
-        continue;
+      if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
+        return false; // full abbreviation match at sentence start -> no acceptable break
+      } else {
+        final int tokenLength = abbToken.length();
+        final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
+        if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex ||
+          /*
+           * Note:
+           * Skip the abbreviation candidate if a regular character directly precedes it,
+           * that is, any character other than a whitespace, an apostrophe, or an opening round bracket.
+           * This prevents mismatches from overlaps close to an actual sentence end.
+           */
+            !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
+
+          continue;
+        }
+        return false; // in case of a valid abbreviation: the (sentence) break is not accepted
       }
-      return false; // in case of a valid abbreviation: the (sentence) break is not accepted
     }
     return true; // no abbreviation(s) at given positions: valid sentence boundary
   }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -152,10 +152,10 @@ void testSentDetectWithUseTokenEndFalse() {
   }
 
   /*
-    * A reproducer and test for OPENNLP-1781.
+   * A reproducer and test for OPENNLP-1781.
    */
   @Test
-  void testSentDetectWithAbbreviationsAtSentenceStart() {
+  void testSentDetectWithSingleLetterAbbreviationsAtSentenceStart() {
     prepareResources(true);
 
     final String sent1 = "S. Träume sind eine Verbindung von Gedanken.";
@@ -169,6 +169,27 @@ void testSentDetectWithAbbreviationsAtSentenceStart() {
         () -> assertEquals(1, probs.length));
   }
 
+  /*
+   * A reproducer and test for OPENNLP-1809.
+   */
+  @Test
+  void testSentDetectWithMultiLetterAbbreviationsAtSentenceStart() {
+    prepareResources(true);
+
+    final String sent1 = "Bek. Problem: Schlafmangel.";
+    final String sent2 = "Über die letzten Tage hinweg war sie zunehmend müde.";
+
+    String sampleSentences = sent1 + " " + sent2;
+    String[] sents = sentenceDetector.sentDetect(sampleSentences);
+    double[] probs = sentenceDetector.probs();
+
+    assertAll(
+        () -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
+  }
+
   /*
    * A reproducer and test for OPENNLP-1767.
    * It checks that sentence detection with common abbreviations works correctly,
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -210,8 +210,8 @@ void checkCustomPatternForTokenizerME(String lang, String pattern, String senten
 
     Assertions.assertEquals(expectedNumTokens, tokens.length);
     String[] sentSplit = sentence
-            .replaceAll("'", " '")
-            .replaceAll(",", " ,")
+            .replace("'", " '")
+            .replace(",", " ,")
             .split(" ");
     for (int i = 0; i < sentSplit.length; i++) {
       String sElement = sentSplit[i];
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml b/opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml
@@ -47,4 +47,7 @@
   <entry>
     <token>z.B.</token>
   </entry>
+  <entry>
+    <token>Bek.</token>
+  </entry>
 </dictionary>