diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java index 933895bfe..77d09baf9 100644 --- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java +++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java @@ -92,7 +92,7 @@ public void run(String format, String[] args) { try { Dictionary dict = loadDict(params.getAbbDict()); SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create( - params.getFactory(), params.getLang(), true, dict, eos); + params.getFactory(), params.getLang(), params.getUseTokenEnd(), dict, eos); model = SentenceDetectorME.train(params.getLang(), sampleStream, sdFactory, mlParams); } catch (IOException e) { diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java index 476f929ad..37cb71159 100644 --- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java +++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java @@ -44,4 +44,9 @@ interface TrainingParams extends BasicTrainingParams { description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.") @OptionalParameter String getFactory(); + + @ParameterDescription(valueName = "useTokenEnd", + description = "A boolean parameter to detect the start index of the next sentence in the test data.") + @OptionalParameter(defaultValue = "true") + Boolean getUseTokenEnd(); } diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java index a520ed27e..7593100af 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java @@ -20,12 +20,16 @@ import java.io.IOException; import java.util.Locale; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import opennlp.tools.dictionary.Dictionary; +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.fail; + /** * Tests for the {@link SentenceDetectorME} class. *

@@ -42,64 +46,105 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { private static final char[] EOS_CHARS = {'.', '?', '!'}; - - private static SentenceModel sentdetectModel; + private static Dictionary abbreviationDict; + private SentenceModel sentdetectModel; @BeforeAll - public static void prepareResources() throws IOException { - Dictionary abbreviationDict = loadAbbDictionary(Locale.GERMAN); - SentenceDetectorFactory factory = new SentenceDetectorFactory( - "deu", true, abbreviationDict, EOS_CHARS); - sentdetectModel = train(factory, Locale.GERMAN); - Assertions.assertNotNull(sentdetectModel); - Assertions.assertEquals("deu", sentdetectModel.getLanguage()); + static void loadResources() throws IOException { + abbreviationDict = loadAbbDictionary(Locale.GERMAN); + } + + /** + * Trains the German model with a factory built from the given {@code useTokenEnd} value, stores it + * in {@code sentdetectModel} and asserts that it is non-null and reports the language "deu". + * An {@link IOException} raised while training fails the calling test, keeping the exception as cause. + * + * @param useTokenEnd The flag handed to the {@link SentenceDetectorFactory} constructor. + */ + private void prepareResources(boolean useTokenEnd) { + try { + SentenceDetectorFactory factory = new SentenceDetectorFactory( + "deu", useTokenEnd, abbreviationDict, EOS_CHARS); + sentdetectModel = train(factory, Locale.GERMAN); + + assertAll(() -> assertNotNull(sentdetectModel), + () -> assertEquals("deu", sentdetectModel.getLanguage())); + } catch (IOException ex) { + fail("Couldn't train the SentenceModel using test data. Exception: " + ex.getMessage(), ex); + } } // Example taken from 'Sentences_DE.txt' @Test void testSentDetectWithInlineAbbreviationsEx1() { + prepareResources(true); + final String sent1 = "Ein Traum, zu dessen Bildung eine besonders starke Verdichtung beigetragen, " + - "wird für diese Untersuchung das günstigste Material sein."; + "wird für diese Untersuchung das günstigste Material sein."; // Here we have two abbreviations "S. = Seite" and "ff. = folgende (Plural)" final String sent2 = "Ich wähle den auf S. 183 ff. 
mitgeteilten Traum von der botanischen Monographie."; SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); String sampleSentences = sent1 + " " + sent2; String[] sents = sentDetect.sentDetect(sampleSentences); - Assertions.assertEquals(2, sents.length); - Assertions.assertEquals(sent1, sents[0]); - Assertions.assertEquals(sent2, sents[1]); double[] probs = sentDetect.getSentenceProbabilities(); - Assertions.assertEquals(2, probs.length); + + assertAll(() -> assertEquals(2, sents.length), + () -> assertEquals(sent1, sents[0]), + () -> assertEquals(sent2, sents[1]), + () -> assertEquals(2, probs.length)); } // Reduced example taken from 'Sentences_DE.txt' @Test void testSentDetectWithInlineAbbreviationsEx2() { + prepareResources(true); + // Here we have three abbreviations: "S. = Seite", "vgl. = vergleiche", and "f. = folgende (Singular)" final String sent1 = "Die farbige Tafel, die ich aufschlage, " + - "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein."; + "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein."; SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); String[] sents = sentDetect.sentDetect(sent1); - Assertions.assertEquals(1, sents.length); - Assertions.assertEquals(sent1, sents[0]); double[] probs = sentDetect.getSentenceProbabilities(); - Assertions.assertEquals(1, probs.length); + + assertAll(() -> assertEquals(1, sents.length), + () -> assertEquals(sent1, sents[0]), + () -> assertEquals(1, probs.length)); } // Modified example deduced from 'Sentences_DE.txt' @Test void testSentDetectWithInlineAbbreviationsEx3() { + prepareResources(true); + // Here we have two abbreviations "z. B. = zum Beispiel" and "S. = Seite" final String sent1 = "Die farbige Tafel, die ich aufschlage, " + - "geht (z. B. die Analyse S. 185) auf ein neues Thema ein."; + "geht (z. B. die Analyse S. 
185) auf ein neues Thema ein."; SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); String[] sents = sentDetect.sentDetect(sent1); - Assertions.assertEquals(1, sents.length); - Assertions.assertEquals(sent1, sents[0]); double[] probs = sentDetect.getSentenceProbabilities(); - Assertions.assertEquals(1, probs.length); + + assertAll(() -> assertEquals(1, sents.length), + () -> assertEquals(sent1, sents[0]), + () -> assertEquals(1, probs.length)); + } + + @Test + void testSentDetectWithUseTokenEndFalse() { + prepareResources(false); + + final String sent1 = "Träume sind eine Verbindung von Gedanken."; + final String sent2 = "Verschiedene Gedanken sind während der Traumformation aktiv."; + + SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); + //There is no blank space before start of the second sentence. + String[] sents = sentDetect.sentDetect(sent1 + sent2); + double[] probs = sentDetect.getSentenceProbabilities(); + + assertAll(() -> assertEquals(2, sents.length), + () -> assertEquals(sent1, sents[0]), + () -> assertEquals(sent2, sents[1]), + () -> assertEquals(2, probs.length)); } } diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml index 11b047d31..f73248ec1 100644 --- a/opennlp-docs/src/docbkx/sentdetect.xml +++ b/opennlp-docs/src/docbkx/sentdetect.xml @@ -142,7 +142,10 @@ Arguments description: -data sampleData data to be used, usually a file name. -encoding charsetName - encoding for reading and writing text, if absent the system default is used.]]> + encoding for reading and writing text, if absent the system default is used. + -useTokenEnd boolean flag + set to false when the next sentence in the test dataset doesn't start with a blank space after the end of + the previous sentence. If absent, it defaults to true.]]> To train an English sentence detector use the following command: