diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java index 933895bfe..77d09baf9 100644 --- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java +++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java @@ -92,7 +92,7 @@ public void run(String format, String[] args) { try { Dictionary dict = loadDict(params.getAbbDict()); SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create( - params.getFactory(), params.getLang(), true, dict, eos); + params.getFactory(), params.getLang(), params.getUseTokenEnd(), dict, eos); model = SentenceDetectorME.train(params.getLang(), sampleStream, sdFactory, mlParams); } catch (IOException e) { diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java index 476f929ad..37cb71159 100644 --- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java +++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java @@ -44,4 +44,9 @@ interface TrainingParams extends BasicTrainingParams { description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.") @OptionalParameter String getFactory(); + + @ParameterDescription(valueName = "useTokenEnd", + description = "A boolean parameter to detect the start index of the next sentence in the test data.") + @OptionalParameter(defaultValue = "true") + Boolean getUseTokenEnd(); } diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java index a520ed27e..7593100af 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java @@ -20,12 +20,16 @@ import java.io.IOException; import java.util.Locale; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import opennlp.tools.dictionary.Dictionary; +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.fail; + /** * Tests for the {@link SentenceDetectorME} class. *
@@ -42,64 +46,99 @@
public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
private static final char[] EOS_CHARS = {'.', '?', '!'};
-
- private static SentenceModel sentdetectModel;
+ private static Dictionary abbreviationDict; // loaded once for the whole class in loadResources()
+ private SentenceModel sentdetectModel; // assigned by prepareResources(boolean) inside each test
@BeforeAll
- public static void prepareResources() throws IOException {
- Dictionary abbreviationDict = loadAbbDictionary(Locale.GERMAN);
- SentenceDetectorFactory factory = new SentenceDetectorFactory(
- "deu", true, abbreviationDict, EOS_CHARS);
- sentdetectModel = train(factory, Locale.GERMAN);
- Assertions.assertNotNull(sentdetectModel);
- Assertions.assertEquals("deu", sentdetectModel.getLanguage());
+ static void loadResources() throws IOException {
+ abbreviationDict = loadAbbDictionary(Locale.GERMAN); // only the dictionary is shared; models are trained per test
+ }
+
+ private void prepareResources(boolean useTokenEnd) { // trains sentdetectModel with the given flag
+ try {
+ SentenceDetectorFactory factory = new SentenceDetectorFactory(
+ "deu", useTokenEnd, abbreviationDict, EOS_CHARS);
+ sentdetectModel = train(factory, Locale.GERMAN);
+ // Sanity-check the freshly trained model before the calling test uses it.
+ assertAll(() -> assertNotNull(sentdetectModel),
+ () -> assertEquals("deu", sentdetectModel.getLanguage()));
+ } catch (IOException ex) { // keep ex as the cause so the failure report shows its stack trace
+ fail("Couldn't train the SentenceModel using test data. Exception: " + ex.getMessage(), ex);
+ }
}
// Example taken from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx1() {
+ prepareResources(true); // useTokenEnd=true, the value that was hard-coded before this change
+
final String sent1 = "Ein Traum, zu dessen Bildung eine besonders starke Verdichtung beigetragen, " +
- "wird für diese Untersuchung das günstigste Material sein.";
+ "wird für diese Untersuchung das günstigste Material sein.";
// Here we have two abbreviations "S. = Seite" and "ff. = folgende (Plural)"
final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String sampleSentences = sent1 + " " + sent2;
String[] sents = sentDetect.sentDetect(sampleSentences);
- Assertions.assertEquals(2, sents.length);
- Assertions.assertEquals(sent1, sents[0]);
- Assertions.assertEquals(sent2, sents[1]);
double[] probs = sentDetect.getSentenceProbabilities();
- Assertions.assertEquals(2, probs.length);
+
+ assertAll(() -> assertEquals(2, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(sent2, sents[1]),
+ () -> assertEquals(2, probs.length));
}
// Reduced example taken from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx2() {
+ prepareResources(true); // useTokenEnd=true, the value that was hard-coded before this change
+
// Here we have three abbreviations: "S. = Seite", "vgl. = vergleiche", and "f. = folgende (Singular)"
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
- "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
+ "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String[] sents = sentDetect.sentDetect(sent1);
- Assertions.assertEquals(1, sents.length);
- Assertions.assertEquals(sent1, sents[0]);
double[] probs = sentDetect.getSentenceProbabilities();
- Assertions.assertEquals(1, probs.length);
+
+ assertAll(() -> assertEquals(1, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(1, probs.length));
}
// Modified example deduced from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx3() {
+ prepareResources(true); // useTokenEnd=true, the value that was hard-coded before this change
+
// Here we have two abbreviations "z. B. = zum Beispiel" and "S. = Seite"
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
- "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
+ "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String[] sents = sentDetect.sentDetect(sent1);
- Assertions.assertEquals(1, sents.length);
- Assertions.assertEquals(sent1, sents[0]);
double[] probs = sentDetect.getSentenceProbabilities();
- Assertions.assertEquals(1, probs.length);
+
+ assertAll(() -> assertEquals(1, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(1, probs.length));
+ }
+
+ @Test
+ void testSentDetectWithUseTokenEndFalse() {
+ prepareResources(false); // the only test that trains the model with useTokenEnd=false
+
+ final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+ final String sent2 = "Verschiedene Gedanken sind während der Traumformation aktiv.";
+
+ SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
+ // No blank separates the two sentences: sent2 starts directly after the '.' of sent1.
+ String[] sents = sentDetect.sentDetect(sent1 + sent2);
+ double[] probs = sentDetect.getSentenceProbabilities();
+
+ assertAll(() -> assertEquals(2, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(sent2, sents[1]),
+ () -> assertEquals(2, probs.length));
}
}
diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml
index 11b047d31..f73248ec1 100644
--- a/opennlp-docs/src/docbkx/sentdetect.xml
+++ b/opennlp-docs/src/docbkx/sentdetect.xml
@@ -142,7 +142,10 @@ Arguments description:
-data sampleData
data to be used, usually a file name.
-encoding charsetName
- encoding for reading and writing text, if absent the system default is used.]]>
+ encoding for reading and writing text, if absent the system default is used.
+ -useTokenEnd boolean flag
+ set to false when the next sentence in the test data starts directly after the previous sentence,
+ i.e. without a separating blank space. If absent, it defaults to true.]]>
To train an English sentence detector use the following command: