Skip to content

Commit eab70aa

Browse files
authored
OPENNLP-1745: SentenceDetector - Add Junit test for useTokenEnd = false (#792)
* OPENNLP-1745: SentenceDetector - Add Junit test for useTokenEnd = false * Added useTokenEnd to the list of optional params available for sentence detector tool.
1 parent bc51931 commit eab70aa

File tree

4 files changed

+72
-25
lines changed

4 files changed

+72
-25
lines changed

opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ public void run(String format, String[] args) {
9292
try {
9393
Dictionary dict = loadDict(params.getAbbDict());
9494
SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
95-
params.getFactory(), params.getLang(), true, dict, eos);
95+
params.getFactory(), params.getLang(), params.getUseTokenEnd(), dict, eos);
9696
model = SentenceDetectorME.train(params.getLang(), sampleStream,
9797
sdFactory, mlParams);
9898
} catch (IOException e) {

opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,9 @@ interface TrainingParams extends BasicTrainingParams {
4444
description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.")
4545
@OptionalParameter
4646
String getFactory();
47+
48+
@ParameterDescription(valueName = "useTokenEnd",
49+
description = "A boolean parameter to detect the start index of the next sentence in the test data.")
50+
@OptionalParameter(defaultValue = "true")
51+
Boolean getUseTokenEnd();
4752
}

opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java

Lines changed: 62 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,16 @@
2020
import java.io.IOException;
2121
import java.util.Locale;
2222

23-
import org.junit.jupiter.api.Assertions;
2423
import org.junit.jupiter.api.BeforeAll;
2524
import org.junit.jupiter.api.Test;
2625

2726
import opennlp.tools.dictionary.Dictionary;
2827

28+
import static org.junit.jupiter.api.Assertions.assertAll;
29+
import static org.junit.jupiter.api.Assertions.assertEquals;
30+
import static org.junit.jupiter.api.Assertions.assertNotNull;
31+
import static org.junit.jupiter.api.Assertions.fail;
32+
2933
/**
3034
* Tests for the {@link SentenceDetectorME} class.
3135
* <p>
@@ -42,64 +46,99 @@
4246
public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
4347

4448
private static final char[] EOS_CHARS = {'.', '?', '!'};
45-
46-
private static SentenceModel sentdetectModel;
49+
private static Dictionary abbreviationDict;
50+
private SentenceModel sentdetectModel;
4751

4852
@BeforeAll
49-
public static void prepareResources() throws IOException {
50-
Dictionary abbreviationDict = loadAbbDictionary(Locale.GERMAN);
51-
SentenceDetectorFactory factory = new SentenceDetectorFactory(
52-
"deu", true, abbreviationDict, EOS_CHARS);
53-
sentdetectModel = train(factory, Locale.GERMAN);
54-
Assertions.assertNotNull(sentdetectModel);
55-
Assertions.assertEquals("deu", sentdetectModel.getLanguage());
53+
static void loadResources() throws IOException {
54+
abbreviationDict = loadAbbDictionary(Locale.GERMAN);
55+
}
56+
57+
private void prepareResources(boolean useTokenEnd) {
58+
try {
59+
SentenceDetectorFactory factory = new SentenceDetectorFactory(
60+
"deu", useTokenEnd, abbreviationDict, EOS_CHARS);
61+
sentdetectModel = train(factory, Locale.GERMAN);
62+
63+
assertAll(() -> assertNotNull(sentdetectModel),
64+
() -> assertEquals("deu", sentdetectModel.getLanguage()));
65+
} catch (IOException ex) {
66+
fail("Couldn't train the SentenceModel using test data. Exception: " + ex.getMessage());
67+
}
5668
}
5769

5870
// Example taken from 'Sentences_DE.txt'
5971
@Test
6072
void testSentDetectWithInlineAbbreviationsEx1() {
73+
prepareResources(true);
74+
6175
final String sent1 = "Ein Traum, zu dessen Bildung eine besonders starke Verdichtung beigetragen, " +
62-
"wird für diese Untersuchung das günstigste Material sein.";
76+
"wird für diese Untersuchung das günstigste Material sein.";
6377
// Here we have two abbreviations "S. = Seite" and "ff. = folgende (Plural)"
6478
final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
6579

6680
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
6781
String sampleSentences = sent1 + " " + sent2;
6882
String[] sents = sentDetect.sentDetect(sampleSentences);
69-
Assertions.assertEquals(2, sents.length);
70-
Assertions.assertEquals(sent1, sents[0]);
71-
Assertions.assertEquals(sent2, sents[1]);
7283
double[] probs = sentDetect.getSentenceProbabilities();
73-
Assertions.assertEquals(2, probs.length);
84+
85+
assertAll(() -> assertEquals(2, sents.length),
86+
() -> assertEquals(sent1, sents[0]),
87+
() -> assertEquals(sent2, sents[1]),
88+
() -> assertEquals(2, probs.length));
7489
}
7590

7691
// Reduced example taken from 'Sentences_DE.txt'
7792
@Test
7893
void testSentDetectWithInlineAbbreviationsEx2() {
94+
prepareResources(true);
95+
7996
// Here we have three abbreviations: "S. = Seite", "vgl. = vergleiche", and "f. = folgende (Singular)"
8097
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
81-
"geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
98+
"geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
8299

83100
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
84101
String[] sents = sentDetect.sentDetect(sent1);
85-
Assertions.assertEquals(1, sents.length);
86-
Assertions.assertEquals(sent1, sents[0]);
87102
double[] probs = sentDetect.getSentenceProbabilities();
88-
Assertions.assertEquals(1, probs.length);
103+
104+
assertAll(() -> assertEquals(1, sents.length),
105+
() -> assertEquals(sent1, sents[0]),
106+
() -> assertEquals(1, probs.length));
89107
}
90108

91109
// Modified example deduced from 'Sentences_DE.txt'
92110
@Test
93111
void testSentDetectWithInlineAbbreviationsEx3() {
112+
prepareResources(true);
113+
94114
// Here we have two abbreviations "z. B. = zum Beispiel" and "S. = Seite"
95115
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
96-
"geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
116+
"geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
97117

98118
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
99119
String[] sents = sentDetect.sentDetect(sent1);
100-
Assertions.assertEquals(1, sents.length);
101-
Assertions.assertEquals(sent1, sents[0]);
102120
double[] probs = sentDetect.getSentenceProbabilities();
103-
Assertions.assertEquals(1, probs.length);
121+
122+
assertAll(() -> assertEquals(1, sents.length),
123+
() -> assertEquals(sent1, sents[0]),
124+
() -> assertEquals(1, probs.length));
125+
}
126+
127+
@Test
128+
void testSentDetectWithUseTokenEndFalse() {
129+
prepareResources(false);
130+
131+
final String sent1 = "Träume sind eine Verbindung von Gedanken.";
132+
final String sent2 = "Verschiedene Gedanken sind während der Traumformation aktiv.";
133+
134+
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
135+
// There is no blank space before the start of the second sentence.
136+
String[] sents = sentDetect.sentDetect(sent1 + sent2);
137+
double[] probs = sentDetect.getSentenceProbabilities();
138+
139+
assertAll(() -> assertEquals(2, sents.length),
140+
() -> assertEquals(sent1, sents[0]),
141+
() -> assertEquals(sent2, sents[1]),
142+
() -> assertEquals(2, probs.length));
104143
}
105144
}

opennlp-docs/src/docbkx/sentdetect.xml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,10 @@ Arguments description:
142142
-data sampleData
143143
data to be used, usually a file name.
144144
-encoding charsetName
145-
encoding for reading and writing text, if absent the system default is used.]]>
145+
encoding for reading and writing text, if absent the system default is used.
146+
-useTokenEnd boolean flag
147+
set to false when the next sentence in the test dataset does not start with a blank space after the end of
148+
the previous sentence. If absent, it defaults to true.]]>
146149
</screen>
147150
To train an English sentence detector use the following command:
148151
<screen>

0 commit comments

Comments
 (0)