Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ public void run(String format, String[] args) {
try {
Dictionary dict = loadDict(params.getAbbDict());
SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
params.getFactory(), params.getLang(), true, dict, eos);
params.getFactory(), params.getLang(), params.getUseTokenEnd(), dict, eos);
model = SentenceDetectorME.train(params.getLang(), sampleStream,
sdFactory, mlParams);
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,9 @@ interface TrainingParams extends BasicTrainingParams {
description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.")
@OptionalParameter
String getFactory();

/**
 * Optional {@code useTokenEnd} training parameter. When the parameter is not
 * supplied, the {@code @OptionalParameter} annotation below provides
 * {@code "true"} as its default value.
 *
 * @return the configured {@code useTokenEnd} value.
 */
@ParameterDescription(valueName = "useTokenEnd",
description = "A boolean parameter to detect the start index of the next sentence in the test data.")
@OptionalParameter(defaultValue = "true")
Boolean getUseTokenEnd();
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,16 @@
import java.io.IOException;
import java.util.Locale;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import opennlp.tools.dictionary.Dictionary;

import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.fail;

/**
* Tests for the {@link SentenceDetectorME} class.
* <p>
Expand All @@ -42,64 +46,99 @@
public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {

private static final char[] EOS_CHARS = {'.', '?', '!'};

private static SentenceModel sentdetectModel;
private static Dictionary abbreviationDict;
private SentenceModel sentdetectModel;

@BeforeAll
public static void prepareResources() throws IOException {
Dictionary abbreviationDict = loadAbbDictionary(Locale.GERMAN);
SentenceDetectorFactory factory = new SentenceDetectorFactory(
"deu", true, abbreviationDict, EOS_CHARS);
sentdetectModel = train(factory, Locale.GERMAN);
Assertions.assertNotNull(sentdetectModel);
Assertions.assertEquals("deu", sentdetectModel.getLanguage());
static void loadResources() throws IOException {
abbreviationDict = loadAbbDictionary(Locale.GERMAN);
}

/**
 * Trains a German ("deu") sentence model with the given {@code useTokenEnd} setting and
 * stores it in the {@code sentdetectModel} field for use by the calling test.
 * <p>
 * The trained model is checked to be non-null and to report the language code "deu".
 * An {@link IOException} raised during training fails the calling test.
 *
 * @param useTokenEnd The value handed to the {@link SentenceDetectorFactory} constructor.
 */
private void prepareResources(boolean useTokenEnd) {
  try {
    SentenceDetectorFactory factory = new SentenceDetectorFactory(
        "deu", useTokenEnd, abbreviationDict, EOS_CHARS);
    sentdetectModel = train(factory, Locale.GERMAN);

    assertAll(() -> assertNotNull(sentdetectModel),
        () -> assertEquals("deu", sentdetectModel.getLanguage()));
  } catch (IOException ex) {
    // Pass the exception along as the cause so the report contains the full stack trace
    // of the training failure instead of only its message text.
    fail("Couldn't train the SentenceModel using test data. Exception: " + ex.getMessage(), ex);
  }
}

// Example taken from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx1() {
prepareResources(true);

final String sent1 = "Ein Traum, zu dessen Bildung eine besonders starke Verdichtung beigetragen, " +
"wird für diese Untersuchung das günstigste Material sein.";
"wird für diese Untersuchung das günstigste Material sein.";
// Here we have two abbreviations "S. = Seite" and "ff. = folgende (Plural)"
final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";

SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String sampleSentences = sent1 + " " + sent2;
String[] sents = sentDetect.sentDetect(sampleSentences);
Assertions.assertEquals(2, sents.length);
Assertions.assertEquals(sent1, sents[0]);
Assertions.assertEquals(sent2, sents[1]);
double[] probs = sentDetect.getSentenceProbabilities();
Assertions.assertEquals(2, probs.length);

assertAll(() -> assertEquals(2, sents.length),
() -> assertEquals(sent1, sents[0]),
() -> assertEquals(sent2, sents[1]),
() -> assertEquals(2, probs.length));
}

// Reduced example taken from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx2() {
prepareResources(true);

// Here we have three abbreviations: "S. = Seite", "vgl. = vergleiche", and "f. = folgende (Singular)"
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
"geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
"geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";

SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String[] sents = sentDetect.sentDetect(sent1);
Assertions.assertEquals(1, sents.length);
Assertions.assertEquals(sent1, sents[0]);
double[] probs = sentDetect.getSentenceProbabilities();
Assertions.assertEquals(1, probs.length);

assertAll(() -> assertEquals(1, sents.length),
() -> assertEquals(sent1, sents[0]),
() -> assertEquals(1, probs.length));
}

// Modified example deduced from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx3() {
prepareResources(true);

// Here we have two abbreviations "z. B. = zum Beispiel" and "S. = Seite"
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
"geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
"geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";

SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String[] sents = sentDetect.sentDetect(sent1);
Assertions.assertEquals(1, sents.length);
Assertions.assertEquals(sent1, sents[0]);
double[] probs = sentDetect.getSentenceProbabilities();
Assertions.assertEquals(1, probs.length);

assertAll(() -> assertEquals(1, sents.length),
() -> assertEquals(sent1, sents[0]),
() -> assertEquals(1, probs.length));
}

/**
 * Checks sentence detection with a model prepared via {@code prepareResources(false)}:
 * two sentences joined without any separating blank must be returned as two sentences,
 * each with its own probability.
 */
@Test
void testSentDetectWithUseTokenEndFalse() {
  prepareResources(false);

  final String first = "Träume sind eine Verbindung von Gedanken.";
  final String second = "Verschiedene Gedanken sind während der Traumformation aktiv.";
  // Joined directly: the second sentence starts right after the full stop of the first one.
  final String text = first + second;

  final SentenceDetectorME detector = new SentenceDetectorME(sentdetectModel);
  final String[] detected = detector.sentDetect(text);
  final double[] probabilities = detector.getSentenceProbabilities();

  assertAll(
      () -> assertEquals(2, detected.length),
      () -> assertEquals(first, detected[0]),
      () -> assertEquals(second, detected[1]),
      () -> assertEquals(2, probabilities.length));
}
}
5 changes: 4 additions & 1 deletion opennlp-docs/src/docbkx/sentdetect.xml
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,10 @@ Arguments description:
-data sampleData
data to be used, usually a file name.
-encoding charsetName
encoding for reading and writing text, if absent the system default is used.]]>
encoding for reading and writing text, if absent the system default is used.
-useTokenEnd boolean flag
set to false when the next sentence in the test dataset starts directly after the previous sentence,
without a blank space in between. If absent, it defaults to true.]]>
</screen>
To train an English sentence detector use the following command:
<screen>
Expand Down