Skip to content

Commit 235f503

Browse files
committed
- adds three tagged sentences for the Polish language, tagged and contributed by @alsmolarczyk, a native speaker of Polish
1 parent 9470034 commit 235f503

File tree

1 file changed

+56
-17
lines changed

1 file changed

+56
-17
lines changed

opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,35 +19,53 @@
1919

2020
import java.io.IOException;
2121
import java.util.ArrayList;
22+
import java.util.HashMap;
2223
import java.util.List;
24+
import java.util.Map;
2325
import java.util.stream.Stream;
2426

27+
import org.junit.jupiter.api.BeforeAll;
2528
import org.junit.jupiter.params.ParameterizedTest;
2629
import org.junit.jupiter.params.provider.Arguments;
2730
import org.junit.jupiter.params.provider.MethodSource;
2831

32+
import opennlp.tools.tokenize.ThreadSafeTokenizerME;
2933
import opennlp.tools.tokenize.Tokenizer;
30-
import opennlp.tools.tokenize.TokenizerME;
3134

3235
import static org.junit.jupiter.api.Assertions.assertEquals;
3336
import static org.junit.jupiter.api.Assertions.assertNotNull;
3437
import static org.junit.jupiter.api.Assertions.assertTrue;
3538

3639
public class POSTaggerMEIT {
3740

41+
private static final String CATALAN = "ca";
42+
private static final String ENGLISH = "en";
43+
private static final String GERMAN = "de";
44+
private static final String POLISH = "pl";
45+
private static final String PORTUGUESE = "pt";
46+
47+
private static final Map<String, Tokenizer> TOKENIZERS = new HashMap<>();
48+
private static final Map<String, POSTagger> TAGGERS = new HashMap<>();
49+
3850
private static final boolean debug = false;
3951

52+
@BeforeAll
53+
public static void initResources() throws IOException {
54+
List<String> langs = List.of(CATALAN, ENGLISH, GERMAN, POLISH, PORTUGUESE);
55+
for (String langCode: langs) {
56+
TOKENIZERS.put(langCode, new ThreadSafeTokenizerME(langCode));
57+
TAGGERS.put(langCode, new ThreadSafePOSTaggerME(langCode));
58+
}
59+
}
60+
4061
@ParameterizedTest(name = "Verify \"{0}\" sample")
4162
@MethodSource(value = "provideData")
42-
void testPOSTagger(String langCode, String input, String[] expectedTags) throws IOException {
43-
44-
Tokenizer tokenizer = new TokenizerME(langCode);
45-
POSTagger tagger = new POSTaggerME(langCode);
63+
void testPOSTagger(String langCode, int allowedDelta, String input, String[] expectedTags) {
4664

47-
String[] tokens = tokenizer.tokenize(input);
65+
final String[] tokens = TOKENIZERS.get(langCode).tokenize(input);
4866
assertNotNull(tokens);
4967
assertEquals(expectedTags.length, tokens.length);
50-
String[] tags = tagger.tag(tokens);
68+
final String[] tags = TAGGERS.get(langCode).tag(tokens);
5169
assertNotNull(tags);
5270
assertEquals(expectedTags.length, tags.length);
5371
StringBuilder fullyTagged = new StringBuilder();
@@ -71,24 +89,23 @@ void testPOSTagger(String langCode, String input, String[] expectedTags) throws
7189
if (debug) {
7290
System.out.println(sb);
7391
}
74-
// assertEquals(expectedTags[i], tags[i]);
7592
}
76-
assertTrue(incorrectTagsPositions.size() <= 1);
93+
assertTrue(incorrectTagsPositions.size() <= allowedDelta);
7794
}
7895

7996
private static Stream<Arguments> provideData() {
8097
return Stream.of(
8198
// see: Dev Manual
82-
Arguments.of("en",
99+
Arguments.of(ENGLISH, 0,
83100
"Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .",
84101
new String[]{"PROPN", "PROPN", "AUX", "NOUN", "ADP", "ADJ", "PROPN", "PUNCT", "DET", "PROPN",
85102
"VERB", "NOUN", "PUNCT"}),
86103
// see: 'de-ud-train-sample.conllu'
87-
Arguments.of("de",
104+
Arguments.of(GERMAN, 0,
88105
"Fachlich kompetent, sehr gute Beratung und ein freundliches Team .",
89-
new String[]{"ADV", "ADJ", "PUNCT", "ADV", "ADJ", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "PUNCT"}),
106+
new String[]{"ADJ", "ADJ", "PUNCT", "ADV", "ADJ", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "PUNCT"}),
90107
// see: 'pt-br-ud-sample.conllu'
91-
Arguments.of("pt",
108+
Arguments.of(PORTUGUESE, 1,
92109
"Numa reunião entre representantes da Secretaria da Criança do DF ea juíza da Vara de Execuções de " +
93110
"Medidas Socioeducativas, Lavínia Tupi Vieira Fonseca, ficou acordado que dos 25 internos, " +
94111
"12 serão internados na Unidade de Planaltina e os outros 13 devem retornar para a Unidade do " +
@@ -100,9 +117,32 @@ private static Stream<Arguments> provideData() {
100117
"PUNCT", "NUM", "AUX", "VERB", "ADP+DET", "PROPN", "ADP", "PROPN", "CCONJ", "DET", "DET", "NUM",
101118
"AUX", "VERB", "ADP", "DET", "PROPN", "ADP+DET", "PROPN", "ADP+DET", "PROPN", "PUNCT", "ADJ",
102119
"PROPN", "PUNCT"}),
103-
// see: @kinow
104-
Arguments.of("ca",
105-
"Un gran embossament d'aire fred es comença a despenjar cap al centre d'Europa.",
120+
// via @alsmolarczyk, original by Lem, Stanisław (1961/2022):
121+
// Solaris, Wydawnictwo Literackie, Kraków, p. 81.
122+
Arguments.of(POLISH, 1,
123+
"Zerwałem się ze stosu zwiniętych spadochronów i pobiegłem prosto do radiostacji .",
124+
new String[]{"VERB+AUX", "PART", "ADP", "NOUN", "ADJ", "NOUN", "CCONJ", "VERB+AUX", "ADV", "ADP",
125+
"NOUN", "PUNCT"}),
126+
// via @alsmolarczyk, original by Tokarczuk, Olga (2009/2021):
127+
// Prowadź swój pług przez kości umarłych, Wydawnictwo Literackie, Kraków, pp. 43-44.
128+
// TODO ask @alsmolarczyk for POS-tagger output:
129+
// Więzienie_NOUN nie_PART tkwi_VERB na_ADP zewnątrz_ADV ,_PUNCT
130+
// ale_CCONJ jest_VERB w_ADP środku_NOUN każdego_DET z_ADP nas_PRON ._PUNCT
131+
Arguments.of(POLISH, 2,
132+
"Więzienie nie tkwi na zewnątrz, ale jest w środku każdego z nas .",
133+
new String[]{"NOUN", "PART", "VERB", "ADP", "NOUN", "PUNCT", "CCONJ", "VERB", "ADP", "NOUN",
134+
"PRON", "ADP", "PRON", "PUNCT"}),
135+
// via @alsmolarczyk, original by Zalega, Dariusz (2019):
136+
// Śląsk zbuntowany, Wydawnictwo Czarne, Wołowiec, p. 96.
137+
Arguments.of(POLISH, 0,
138+
"Działacze stosowali też różne formy nacisku na polski konsulat , żeby zaopiekował się " +
139+
"bezrobotnymi z Polski albo dał im choćby na bezpłatny bilet do kraju .",
140+
new String[]{"NOUN", "VERB", "PART", "ADJ", "NOUN", "NOUN", "ADP", "ADJ", "NOUN", "PUNCT", "SCONJ",
141+
"VERB", "PRON", "ADJ", "ADP", "PROPN", "CCONJ", "VERB", "PRON", "PART", "ADP", "ADJ", "NOUN",
142+
"ADP", "NOUN", "PUNCT"}),
143+
// via: @kinow
144+
Arguments.of(CATALAN, 1,
145+
"Un gran embossament d'aire fred es comença a despenjar cap al centre d'Europa.",
106146
// OpenNLP, different at: idx pos 2, 3, 5, and 13(+14) -> however, only pos 5 is "wrong" (ref)
107147
new String[]{"DET", "ADJ", "NOUN", "ADP", "NOUN", "ADJ", "PRON", "VERB", "ADP", "VERB", "NOUN",
108148
"ADP+DET", "NOUN", "ADP", "PROPN", "PUNCT"})
@@ -115,7 +155,6 @@ private static Stream<Arguments> provideData() {
115155
// "NOUN", "PROPN", "PROPN", "PUNCT"
116156
// ok! , ok! , ??? , ??? , ok! , ok! , ok! , ok! , ok! , ok! , ok! , ok! + ok! ,
117157
// ok! , ??? , ok! , ok!
118-
119158
);
120159
}
121160
}

0 commit comments

Comments
 (0)