1919
2020import java .io .IOException ;
2121import java .util .ArrayList ;
22+ import java .util .HashMap ;
2223import java .util .List ;
24+ import java .util .Map ;
2325import java .util .stream .Stream ;
2426
27+ import org .junit .jupiter .api .BeforeAll ;
2528import org .junit .jupiter .params .ParameterizedTest ;
2629import org .junit .jupiter .params .provider .Arguments ;
2730import org .junit .jupiter .params .provider .MethodSource ;
2831
32+ import opennlp .tools .tokenize .ThreadSafeTokenizerME ;
2933import opennlp .tools .tokenize .Tokenizer ;
30- import opennlp .tools .tokenize .TokenizerME ;
3134
3235import static org .junit .jupiter .api .Assertions .assertEquals ;
3336import static org .junit .jupiter .api .Assertions .assertNotNull ;
3437import static org .junit .jupiter .api .Assertions .assertTrue ;
3538
3639public class POSTaggerMEIT {
3740
// Two-letter language codes; each one is used as the key into TOKENIZERS/TAGGERS and
// as the first argument of the samples produced by provideData().
41+ private static final String CATALAN = "ca" ;
42+ private static final String ENGLISH = "en" ;
43+ private static final String GERMAN = "de" ;
44+ private static final String POLISH = "pl" ;
45+ private static final String PORTUGUESE = "pt" ;
46+
// Per-language tokenizer and tagger instances, keyed by language code. Both maps are
// filled once in initResources() and only read by testPOSTagger(), so the instances
// are shared across all parameterized invocations instead of being re-created per sample.
47+ private static final Map <String , Tokenizer > TOKENIZERS = new HashMap <>();
48+ private static final Map <String , POSTagger > TAGGERS = new HashMap <>();
49+
// When set to true, testPOSTagger() prints its per-token diagnostic text to System.out.
3850 private static final boolean debug = false ;
3951
/**
 * Creates one tokenizer and one POS tagger per supported language code and stores
 * them in {@code TOKENIZERS} and {@code TAGGERS}, keyed by that code.
 * Annotated with {@code @BeforeAll}, i.e. executed once before the tests of this class.
 *
 * @throws IOException declared because the tokenizer/tagger constructors are invoked here;
 *                     presumably raised when a language model cannot be loaded - verify
 *                     against the constructors' documentation.
 */
52+ @ BeforeAll
53+ public static void initResources () throws IOException {
// Every language referenced by provideData() must be listed here, otherwise the
// map lookups in testPOSTagger() yield null for that language.
54+ List <String > langs = List .of (CATALAN , ENGLISH , GERMAN , POLISH , PORTUGUESE );
55+ for (String langCode : langs ) {
// Both components are constructed from the plain language code.
56+ TOKENIZERS .put (langCode , new ThreadSafeTokenizerME (langCode ));
57+ TAGGERS .put (langCode , new ThreadSafePOSTaggerME (langCode ));
58+ }
59+ }
60+
4061 @ ParameterizedTest (name = "Verify \" {0}\" sample" )
4162 @ MethodSource (value = "provideData" )
42- void testPOSTagger (String langCode , String input , String [] expectedTags ) throws IOException {
43-
44- Tokenizer tokenizer = new TokenizerME (langCode );
45- POSTagger tagger = new POSTaggerME (langCode );
63+ void testPOSTagger (String langCode , int allowedDelta , String input , String [] expectedTags ) {
4664
47- String [] tokens = tokenizer .tokenize (input );
65+ final String [] tokens = TOKENIZERS . get ( langCode ) .tokenize (input );
4866 assertNotNull (tokens );
4967 assertEquals (expectedTags .length , tokens .length );
50- String [] tags = tagger .tag (tokens );
68+ final String [] tags = TAGGERS . get ( langCode ) .tag (tokens );
5169 assertNotNull (tags );
5270 assertEquals (expectedTags .length , tags .length );
5371 StringBuilder fullyTagged = new StringBuilder ();
@@ -71,24 +89,23 @@ void testPOSTagger(String langCode, String input, String[] expectedTags) throws
7189 if (debug ) {
7290 System .out .println (sb );
7391 }
74- // assertEquals(expectedTags[i], tags[i]);
7592 }
76- assertTrue (incorrectTagsPositions .size () <= 1 );
93+ assertTrue (incorrectTagsPositions .size () <= allowedDelta );
7794 }
7895
7996 private static Stream <Arguments > provideData () {
8097 return Stream .of (
8198 // see: Dev Manual
82- Arguments .of ("en" ,
99+ Arguments .of (ENGLISH , 0 ,
83100 "Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group ." ,
84101 new String []{"PROPN" , "PROPN" , "AUX" , "NOUN" , "ADP" , "ADJ" , "PROPN" , "PUNCT" , "DET" , "PROPN" ,
85102 "VERB" , "NOUN" , "PUNCT" }),
86103 // see: 'de-ud-train-sample.conllu'
87- Arguments .of ("de" ,
104+ Arguments .of (GERMAN , 0 ,
88105 "Fachlich kompetent, sehr gute Beratung und ein freundliches Team ." ,
89- new String []{"ADV " , "ADJ" , "PUNCT" , "ADV" , "ADJ" , "NOUN" , "CCONJ" , "DET" , "ADJ" , "NOUN" , "PUNCT" }),
106+ new String []{"ADJ " , "ADJ" , "PUNCT" , "ADV" , "ADJ" , "NOUN" , "CCONJ" , "DET" , "ADJ" , "NOUN" , "PUNCT" }),
90107 // see: 'pt-br-ud-sample.conllu'
91- Arguments .of ("pt" ,
108+ Arguments .of (PORTUGUESE , 1 ,
92109 "Numa reunião entre representantes da Secretaria da Criança do DF ea juíza da Vara de Execuções de " +
93110 "Medidas Socioeducativas, Lavínia Tupi Vieira Fonseca, ficou acordado que dos 25 internos, " +
94111 "12 serão internados na Unidade de Planaltina e os outros 13 devem retornar para a Unidade do " +
@@ -100,9 +117,32 @@ private static Stream<Arguments> provideData() {
100117 "PUNCT" , "NUM" , "AUX" , "VERB" , "ADP+DET" , "PROPN" , "ADP" , "PROPN" , "CCONJ" , "DET" , "DET" , "NUM" ,
101118 "AUX" , "VERB" , "ADP" , "DET" , "PROPN" , "ADP+DET" , "PROPN" , "ADP+DET" , "PROPN" , "PUNCT" , "ADJ" ,
102119 "PROPN" , "PUNCT" }),
103- // see: @kinow
104- Arguments .of ("ca" ,
105- "Un gran embossament d'aire fred es comença a despenjar cap al centre d'Europa." ,
120+ // via @alsmolarczyk, original by Lem, Stanisław (1961/2022):
121+ // Solaris, Wydawnictwo Literackie, Kraków, S. 81.
122+ Arguments .of (POLISH , 1 ,
123+ "Zerwałem się ze stosu zwiniętych spadochronów i pobiegłem prosto do radiostacji ." ,
124+ new String []{"VERB+AUX" , "PART" , "ADP" , "NOUN" , "ADJ" , "NOUN" , "CCONJ" , "VERB+AUX" , "ADV" , "ADP" ,
125+ "NOUN" , "PUNCT" }),
126+ // via @alsmolarczyk, original by Tokarczuk, Olga (2009/2021):
127+ // Prowadź swój pług przez kości umarłych, Wydawnictwo Literackie, Kraków, S. 43-44.
128+ // TODO ask @alsmolarczyk for POS-tagger output:
129+ // Więzienie_NOUN nie_PART tkwi_VERB na_ADP zewnątrz_ADV ,_PUNCT
130+ // ale_CCONJ jest_VERB w_ADP środku_NOUN każdego_DET z_ADP nas_PRON ._PUNCT
131+ Arguments .of (POLISH , 2 ,
132+ "Więzienie nie tkwi na zewnątrz, ale jest w środku każdego z nas ." ,
133+ new String []{"NOUN" , "PART" , "VERB" , "ADP" , "NOUN" , "PUNCT" , "CCONJ" , "VERB" , "ADP" , "NOUN" ,
134+ "PRON" , "ADP" , "PRON" , "PUNCT" }),
135+ // via @alsmolarczyk, original by Zalega, Dariusz (2019):
136+ // Śląsk zbuntowany, Wydawnictwo Czarne, Wołowiec, S. 96.
137+ Arguments .of (POLISH , 0 ,
138+ "Działacze stosowali też różne formy nacisku na polski konsulat , żeby zaopiekował się " +
139+ "bezrobotnymi z Polski albo dał im choćby na bezpłatny bilet do kraju ." ,
140+ new String []{"NOUN" , "VERB" , "PART" , "ADJ" , "NOUN" , "NOUN" , "ADP" , "ADJ" , "NOUN" , "PUNCT" , "SCONJ" ,
141+ "VERB" , "PRON" , "ADJ" , "ADP" , "PROPN" , "CCONJ" , "VERB" , "PRON" , "PART" , "ADP" , "ADJ" , "NOUN" ,
142+ "ADP" , "NOUN" , "PUNCT" }),
143+ // via: @kinow
144+ Arguments .of (CATALAN , 1 ,
145+ "Un gran embossament d'aire fred es comença a despenjar cap al centre d'Europa." ,
106146 // OpenNLP, different at: idx pos 2, 3, 5, and 13(+14) -> however, only pos 5 is "wrong" (ref)
107147 new String []{"DET" , "ADJ" , "NOUN" , "ADP" , "NOUN" , "ADJ" , "PRON" , "VERB" , "ADP" , "VERB" , "NOUN" ,
108148 "ADP+DET" , "NOUN" , "ADP" , "PROPN" , "PUNCT" })
@@ -115,7 +155,6 @@ private static Stream<Arguments> provideData() {
115155 // "NOUN", "PROPN", "PROPN", "PUNCT"
116156 // ok! , ok! , ??? , ??? , ok! , ok! , ok! , ok! , ok! , ok! , ok! , ok! + ok! ,
117157 // ok! , ??? , ok! , ok!
118-
119158 );
120159 }
121160}
0 commit comments