Skip to content

Commit 5b690d7

Browse files
committed
OPENNLP-1809: SentenceDetector misses multi-letter abbreviations at sentence start (#983)
- adds reproducer & test
- fixes the issue in SentenceDetectorME#isAcceptableBreak(..)
- refactors some code in other spots
(cherry picked from commit d780617)
1 parent f6e5397 commit 5b690d7

File tree

5 files changed

+53
-25
lines changed

5 files changed

+53
-25
lines changed

opennlp-tools/src/main/java/opennlp/tools/dictionary/serializer/DictionaryEntryPersistor.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ public void startPrefixMapping(String prefix, String uri)
226226
public static boolean create(InputStream in, EntryInserter inserter)
227227
throws IOException {
228228

229-
DictionaryContenthandler profileContentHandler = new DictionaryContenthandler(inserter);
229+
DictionaryContenthandler handler = new DictionaryContenthandler(inserter);
230230

231231
XMLReader xmlReader;
232232
try {
@@ -235,14 +235,14 @@ public static boolean create(InputStream in, EntryInserter inserter)
235235
// There is a compatibility problem here: JAXP default is false while SAX 2 default is true!
236236
// OpenNLP requires it activated!
237237
xmlReader.setFeature(SAX_FEATURE_NAMESPACES, true);
238-
xmlReader.setContentHandler(profileContentHandler);
238+
xmlReader.setContentHandler(handler);
239239
xmlReader.parse(new InputSource(new UncloseableInputStream(in)));
240240
}
241241
catch (ParserConfigurationException | SAXException e) {
242242
throw new InvalidFormatException("The profile data stream has " +
243243
"an invalid format!", e);
244244
}
245-
return profileContentHandler.mIsCaseSensitiveDictionary;
245+
return handler.mIsCaseSensitiveDictionary;
246246
}
247247

248248
/**

opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -339,27 +339,31 @@ protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidate
339339
if (abbDict == null)
340340
return true;
341341

342+
final String text = s.toString();
342343
for (StringList abb : abbDict) {
343-
final String token = abb.getToken(0);
344-
final int tokenPosition = s.toString().indexOf(token, fromIndex);
345-
if (tokenPosition == -1) {
346-
continue; // skip fast
344+
final String abbToken = abb.getToken(0);
345+
final int tokenStartPos = text.indexOf(abbToken, fromIndex);
346+
if (tokenStartPos == -1) {
347+
continue; // skip fast when abb is not present in text
347348
}
348-
349-
final char prevChar = s.charAt(tokenPosition == 0 ? tokenPosition : tokenPosition - 1);
350-
int tokenLength = token.length();
351-
if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex ||
352-
/*
353-
* Note:
354-
* Skip abbreviation candidate if regular characters exist directly before it,
355-
* That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket.
356-
* This prevents mismatches from overlaps close to an actual sentence end.
357-
*/
358-
!(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
359-
360-
continue;
349+
if (tokenStartPos == 0 && text.substring(tokenStartPos, candidateIndex + 1).equals(abbToken)) {
350+
return false; // full abbreviation match at sentence start -> no acceptable break
351+
} else {
352+
final int tokenLength = abbToken.length();
353+
final char prevChar = s.charAt(tokenStartPos == 0 ? tokenStartPos : tokenStartPos - 1);
354+
if (tokenStartPos + tokenLength < candidateIndex || tokenStartPos > candidateIndex ||
355+
/*
356+
* Note:
357+
* Skip the abbreviation candidate if a regular character exists directly before it,
358+
* that is, any character other than a whitespace, an apostrophe, or an opening round bracket.
359+
* This prevents mismatches from overlaps close to an actual sentence end.
360+
*/
361+
!(Character.isWhitespace(prevChar) || isApostrophe(prevChar) || prevChar == '(')) {
362+
363+
continue;
364+
}
365+
return false; // in case of a valid abbreviation: the (sentence) break is not accepted
361366
}
362-
return false; // in case of a valid abbreviation: the (sentence) break is not accepted
363367
}
364368
return true; // no abbreviation(s) at given positions: valid sentence boundary
365369
}

opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,10 @@ void testSentDetectWithUseTokenEndFalse() {
152152
}
153153

154154
/*
155-
* A reproducer and test for OPENNLP-1781.
155+
* A reproducer and test for OPENNLP-1781.
156156
*/
157157
@Test
158-
void testSentDetectWithAbbreviationsAtSentenceStart() {
158+
void testSentDetectWithSingleLetterAbbreviationsAtSentenceStart() {
159159
prepareResources(true);
160160

161161
final String sent1 = "S. Träume sind eine Verbindung von Gedanken.";
@@ -169,6 +169,27 @@ void testSentDetectWithAbbreviationsAtSentenceStart() {
169169
() -> assertEquals(1, probs.length));
170170
}
171171

172+
/*
173+
* A reproducer and test for OPENNLP-1809.
174+
*/
175+
@Test
176+
void testSentDetectWithMultiLetterAbbreviationsAtSentenceStart() {
177+
prepareResources(true);
178+
179+
final String sent1 = "Bek. Problem: Schlafmangel.";
180+
final String sent2 = "Über die letzten Tage hinweg war sie zunehmend müde.";
181+
182+
String sampleSentences = sent1 + " " + sent2;
183+
String[] sents = sentenceDetector.sentDetect(sampleSentences);
184+
double[] probs = sentenceDetector.probs();
185+
186+
assertAll(
187+
() -> assertEquals(2, sents.length),
188+
() -> assertEquals(sent1, sents[0]),
189+
() -> assertEquals(sent2, sents[1]),
190+
() -> assertEquals(2, probs.length));
191+
}
192+
172193
/*
173194
* A reproducer and test for OPENNLP-1767.
174195
* It checks that sentence detection with common abbreviations works correctly,

opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,8 @@ void checkCustomPatternForTokenizerME(String lang, String pattern, String senten
210210

211211
Assertions.assertEquals(expectedNumTokens, tokens.length);
212212
String[] sentSplit = sentence
213-
.replaceAll("'", " '")
214-
.replaceAll(",", " ,")
213+
.replace("'", " '")
214+
.replace(",", " ,")
215215
.split(" ");
216216
for (int i = 0; i < sentSplit.length; i++) {
217217
String sElement = sentSplit[i];

opennlp-tools/src/test/resources/opennlp/tools/lang/abb_DE.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,7 @@
4747
<entry>
4848
<token>z.B.</token>
4949
</entry>
50+
<entry>
51+
<token>Bek.</token>
52+
</entry>
5053
</dictionary>

0 commit comments

Comments
 (0)