Skip to content

Commit dfe0a5a

Browse files
authored
OPENNLP-1810: Fix SentenceDetector fails to detect multiple identical abbreviations in the same sentence (#984)
1 parent d780617 commit dfe0a5a

File tree

2 files changed

+32
-14
lines changed

2 files changed

+32
-14
lines changed

opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -340,29 +340,34 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate
340340
return true;
341341

342342
final String text = s.toString();
343+
final boolean caseSensitive = abbDict.isCaseSensitive();
344+
final String searchText = caseSensitive ? text : StringUtil.toLowerCase(text);
343345
for (StringList abb : abbDict) {
344-
final String abbToken = abb.getToken(0);
345-
final int tokenStartPos = text.indexOf(abbToken, fromIndex);
346-
if (tokenStartPos == -1) {
347-
continue; // skip fast when abb is not present in text
348-
}
349-
if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
350-
return false; // full abbreviation match at sentence start -> no acceptable break
351-
} else {
352-
final int tokenLength = abbToken.length();
346+
final String abbToken = caseSensitive ? abb.getToken(0)
347+
: StringUtil.toLowerCase(abb.getToken(0));
348+
final int tokenLength = abbToken.length();
349+
int tokenStartPos = searchText.indexOf(abbToken, fromIndex);
350+
while (tokenStartPos != -1) {
351+
if (tokenStartPos > candidateIndex) {
352+
break; // past candidate position, no point searching further
353+
}
354+
if (tokenStartPos == 0
355+
&& searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
356+
return false; // full abbreviation match at sentence start -> no acceptable break
357+
}
353358
final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
354-
if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex ||
359+
if (tokenStartPos + tokenLength >= candidateIndex
355360
/*
356361
* Note:
357362
* Skip the abbreviation candidate if a regular character exists directly before it,
358363
* that is, any character other than a whitespace, an apostrophe, or an opening round bracket.
359364
* This prevents mismatches from overlaps close to an actual sentence end.
360365
*/
361-
!(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
362-
363-
continue;
366+
&& (Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
367+
return false; // in case of a valid abbreviation: the (sentence) break is not accepted
364368
}
365-
return false; // in case of a valid abbreviation: the (sentence) break is not accepted
369+
// Try next occurrence of this abbreviation in the text
370+
tokenStartPos = searchText.indexOf(abbToken, tokenStartPos + 1);
366371
}
367372
}
368373
return true; // no abbreviation(s) at given positions: valid sentence boundary

opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,19 @@ void testSentDetectWithMultiLetterAbbreviationsAtSentenceStart() {
190190
() -> assertEquals(2, probs.length));
191191
}
192192

193+
// Edge case: The same abbreviation appears twice in a single sentence segment.
194+
@Test
195+
void testSentDetectWithDuplicateAbbreviationInSameSegment() {
196+
prepareResources(true);
197+
final String sent1 = "Lt. Vertrag und lt. Bescheid gelten andere Bedingungen.";
198+
String[] sents = sentenceDetector.sentDetect(sent1);
199+
double[] probs = sentenceDetector.probs();
200+
assertAll(
201+
() -> assertEquals(1, sents.length),
202+
() -> assertEquals(sent1, sents[0]),
203+
() -> assertEquals(1, probs.length));
204+
}
205+
193206
/*
194207
* A reproducer and test for OPENNLP-1767.
195208
* It checks that sentence detection with common abbreviations works correctly,

0 commit comments

Comments
 (0)