diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java index 55faf79c2..ec71618da 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -340,29 +340,34 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate return true; final String text = s.toString(); + final boolean caseSensitive = abbDict.isCaseSensitive(); + final String searchText = caseSensitive ? text : StringUtil.toLowerCase(text); for (StringList abb : abbDict) { - final String abbToken = abb.getToken(0); - final int tokenStartPos = text.indexOf(abbToken, fromIndex); - if (tokenStartPos == -1) { - continue; // skip fast when abb is not present in text - } - if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) { - return false; // full abbreviation match at sentence start -> no acceptable break - } else { - final int tokenLength = abbToken.length(); + final String abbToken = caseSensitive ? abb.getToken(0) + : StringUtil.toLowerCase(abb.getToken(0)); + final int tokenLength = abbToken.length(); + int tokenStartPos = searchText.indexOf(abbToken, fromIndex); + while (tokenStartPos != -1) { + if (tokenStartPos > candidateIndex) { + break; // past candidate position, no point searching further + } + if (tokenStartPos == 0 + && searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) { + return false; // full abbreviation match at sentence start -> no acceptable break + } final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1); - if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex || + if (tokenStartPos + tokenLength >= candidateIndex /* * Note: * Skip abbreviation candidate if regular characters exist directly before it, * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket. * This prevents mismatches from overlaps close to an actual sentence end. */ - !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) { - - continue; + && (Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) { + return false; // in case of a valid abbreviation: the (sentence) break is not accepted } - return false; // in case of a valid abbreviation: the (sentence) break is not accepted + // Try next occurrence of this abbreviation in the text + tokenStartPos = searchText.indexOf(abbToken, tokenStartPos + 1); } } return true; // no abbreviation(s) at given positions: valid sentence boundary diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java index 9d271ce0c..f145dd9e9 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java @@ -190,6 +190,19 @@ void testSentDetectWithMultiLetterAbbreviationsAtSentenceStart() { () -> assertEquals(2, probs.length)); } + // Edge case: The same abbreviation appears twice in a single sentence segment. + @Test + void testSentDetectWithDuplicateAbbreviationInSameSegment() { + prepareResources(true); + final String sent1 = "Lt. Vertrag und lt. Bescheid gelten andere Bedingungen."; + String[] sents = sentenceDetector.sentDetect(sent1); + double[] probs = sentenceDetector.probs(); + assertAll( + () -> assertEquals(1, sents.length), + () -> assertEquals(sent1, sents[0]), + () -> assertEquals(1, probs.length)); + } + /* * A reproducer and test for OPENNLP-1767. * It checks that sentence detection with common abbreviations works correctly,