Skip to content

Commit 56c5bea

Browse files
committed
OPENNLP-1811: Fix SentenceDetector missing abbreviations at non-first sentence start with useTokenEnd=false
1 parent 60a9d47 commit 56c5bea

File tree

2 files changed

+30
-4
lines changed

2 files changed

+30
-4
lines changed

opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,13 @@ public Span[] sentPosDetect(CharSequence s) {
204204
// skip over the leading parts of non-token final delimiters
205205
int fws = getFirstWS(s,cint + 1);
206206
if (i + 1 < end && enders.get(i + 1) < fws) {
207-
continue;
207+
// Do not skip if the character right after the delimiter is uppercase,
208+
// as this likely indicates the start of a new sentence (e.g., "Gedanken.Bek.")
209+
// rather than a lowercase multi-period abbreviation (e.g., "d.h.").
210+
int nextCharIdx = cint + 1;
211+
if (nextCharIdx >= s.length() || !Character.isUpperCase(s.charAt(nextCharIdx))) {
212+
continue;
213+
}
208214
}
209215
if (positions.size() > 0 && cint < positions.get(positions.size() - 1)) continue;
210216

@@ -351,11 +357,11 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate
351357
if (tokenStartPos > candidateIndex) {
352358
break; // past candidate position, no point searching further
353359
}
354-
if (tokenStartPos == 0
360+
if (tokenStartPos == fromIndex
355361
&& searchText.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
356-
return false; // full abbreviation match at sentence start -> no acceptable break
362+
return false; // full abbreviation match at segment start -> no acceptable break
357363
}
358-
final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
364+
final char prevChar = s.charAt(tokenStartPos == fromIndex ? tokenStartPos : tokenStartPos - 1);
359365
if (tokenStartPos + tokenLength >= candidateIndex
360366
/*
361367
* Note:

opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,26 @@ void testSentDetectWithDuplicateAbbreviationInSameSegment() {
203203
() -> assertEquals(1, probs.length));
204204
}
205205

206+
/**
207+
* Edge case: Multi-letter abbreviation at the start of a non-first sentence
208+
* with {@code useTokenEnd = false} (no space between sentences).
209+
*/
210+
@Test
211+
void testSentDetectWithMultiLetterAbbreviationAtNonFirstSentenceStart() {
212+
prepareResources(false);
213+
final String sent1 = "Träume sind eine Verbindung von Gedanken.";
214+
final String sent2 = "Bek. Problem: Schlafmangel.";
215+
// No space between sentences (useTokenEnd=false supports this)
216+
String sampleSentences = sent1 + sent2;
217+
String[] sents = sentenceDetector.sentDetect(sampleSentences);
218+
double[] probs = sentenceDetector.probs();
219+
assertAll(
220+
() -> assertEquals(2, sents.length),
221+
() -> assertEquals(sent1, sents[0]),
222+
() -> assertEquals(sent2, sents[1]),
223+
() -> assertEquals(2, probs.length));
224+
}
225+
206226
/*
207227
* A reproducer and test for OPENNLP-1767.
208228
* It checks that sentence detection with common abbreviations works correctly,

0 commit comments

Comments
 (0)