55import edu .stanford .nlp .util .*;
66
77import static org .junit .Assert .assertEquals ;
8+ import static org .junit .Assert .assertFalse ;
9+ import static org .junit .Assert .assertNotNull ;
10+ import static org .junit .Assert .assertTrue ;
811
912import java .io .*;
13+ import java .util .ArrayList ;
14+ import java .util .List ;
1015import java .util .Properties ;
1116
1217import org .junit .Before ;
@@ -24,53 +29,278 @@ public class CoNLLUReaderITest {
  // Document produced by running the full pipeline over the example text.
  // NOTE(review): nothing visible in this chunk populates goldDocument any
  // more (the @Before setUp that filled it was removed) -- confirm whether
  // this field is still needed or can be deleted.
  public Annotation goldDocument;
  // Document produced by reading the example CoNLL-U file back in.
  public Annotation readInDocument;

27- @ Before
28- public void setUp () throws IOException {
29- // set up the pipeline
30- Properties props = LanguageInfo .getLanguageProperties ("spanish" );
31- props .put ("annotators" , "tokenize,ssplit,mwt,pos,lemma,depparse" );
32- pipeline = new StanfordCoreNLP (props );
33- }
  // Raw text of each sentence in the example CoNLL-U document.
  static final String[] EXPECTED_SENTENCE_TEXT = {
    "Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
    "De allí las rebajas."
  };
  // Whole-document text: one sentence per line with a trailing newline.
  // NOTE(review): System.lineSeparator() is "\r\n" on Windows -- confirm that
  // CoNLLUReader joins sentence texts with the platform separator and not a
  // hard-coded "\n", otherwise this test is platform-dependent.
  static final String EXPECTED_TEXT = String.join(System.lineSeparator(), EXPECTED_SENTENCE_TEXT) + System.lineSeparator();

  // Expected surface form of every token, one row per sentence.
  // "deberse" is split into the two MWT words "deber" + "se".
  static final String[][] EXPECTED_WORD_TEXT = {
    {"Pero", "la", "existencia", "de", "dos", "recién", "nacidos", "en", "la", "misma", "caja", "sólo", "podía", "deber", "se", "a", "un", "descuido", "de", "fábrica", "."},
    {"De", "allí", "las", "rebajas", "."},
  };

  // Expected lemma of every token (parallel to EXPECTED_WORD_TEXT).
  static final String[][] EXPECTED_LEMMA_TEXT = {
    {"pero", "el", "existencia", "de", "dos", "recién", "nacido", "en", "el", "mismo", "caja", "sólo", "poder", "deber", "él", "a", "uno", "descuido", "de", "fábrica", "."},
    {"de", "allí", "el", "rebaja", "."},
  };

  // Expected universal POS (the CoNLL-U UPOS column) of every token.
  static final String[][] EXPECTED_CPOS = {
    {"CCONJ", "DET", "NOUN", "ADP", "NUM", "ADV", "ADJ", "ADP", "DET", "DET", "NOUN", "ADV", "AUX", "VERB", "PRON", "ADP", "DET", "NOUN", "ADP", "NOUN", "PUNCT"},
    {"ADP", "ADV", "DET", "NOUN", "PUNCT"},
  };

  // Expected CoNLL-U FEATS column for every token; null where the token has
  // no morphological features (FEATS column is "_").
  static final String[][] EXPECTED_FEATS = {
    {
      null,
      "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
      "Gender=Fem|Number=Sing",
      null,
      "Number=Plur|NumForm=Word|NumType=Card",
      null,
      "Gender=Masc|Number=Plur|VerbForm=Part",
      null,
      "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
      "Gender=Fem|Number=Sing|PronType=Dem",
      "Gender=Fem|Number=Sing",
      null,
      "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
      "VerbForm=Inf",
      "Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes",
      null,
      "Definite=Ind|Gender=Masc|Number=Sing|PronType=Art",
      "Gender=Masc|Number=Sing",
      null,
      "Gender=Fem|Number=Sing",
      "PunctType=Peri",
    },
    {
      null,
      null,
      "Definite=Def|Gender=Fem|Number=Plur|PronType=Art",
      "Gender=Fem|Number=Plur",
      "PunctType=Peri",
    }
  };

  // Expected dependency relation of each token to its head (DEPREL column).
  static final String[][] EXPECTED_RELNS = {
    { "advmod", "det", "nsubj", "case", "nummod", "advmod", "amod", "case", "det", "det", "nmod", "advmod", "aux", "root",
      "expl:pv", "case", "det", "obl:arg", "case", "nmod", "punct" },
    { "case", "advmod", "det", "root", "punct" },
  };
  // Expected head index of each token, 1-based; 0 marks the sentence root.
  static final int[][] EXPECTED_HEADS = {
    { 14, 3, 14, 7, 7, 7, 3, 11, 11, 9, 3, 14, 14, 0, 14, 18, 18, 14, 20, 18, 14 },
    { 2, 4, 4, 0, 4 },
  };
3495
  /**
   * Reads the example CoNLL-U file with CoNLLUReader and checks every
   * annotation it is expected to produce against the EXPECTED_* constants:
   * document and sentence text, token words / lemmas / UPOS tags, sentence
   * and token indices, MWT markers, before/after whitespace, character and
   * token offsets, CoNLL-U features, and the basic dependency graph of each
   * sentence.
   */
  @Test
  public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException {
    // examplePath points at the example .conllu file; the reader returns one
    // Annotation per document and this file contains exactly one document
    readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(examplePath).get(0);

    // the document should carry exactly these three annotations and no others
    assertTrue(readInDocument.containsKey(CoreAnnotations.TextAnnotation.class));
    assertTrue(readInDocument.containsKey(CoreAnnotations.TokensAnnotation.class));
    assertTrue(readInDocument.containsKey(CoreAnnotations.SentencesAnnotation.class));
    assertEquals(3, readInDocument.keySet().size());

    // Compare text of the document and its sentences
    assertEquals(EXPECTED_TEXT, readInDocument.get(CoreAnnotations.TextAnnotation.class));
    List<CoreMap> sentences = readInDocument.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(EXPECTED_SENTENCE_TEXT.length, sentences.size());
    for (int i = 0; i < EXPECTED_SENTENCE_TEXT.length; ++i) {
      assertEquals(EXPECTED_SENTENCE_TEXT[i], sentences.get(i).get(CoreAnnotations.TextAnnotation.class));
    }

    // Compare sentence ids
    // Check number of keys on each sentence
    for (int i = 0; i < sentences.size(); ++i) {
      assertEquals(Integer.valueOf(i), sentences.get(i).get(CoreAnnotations.SentenceIndexAnnotation.class));
      assertEquals(4, sentences.get(i).keySet().size());
    }

    // Check the document tokens and the sentence tokens lists are the same
    // The composite list on the document level should just be the sentence tokens gathered into one list
    List<CoreMap> allTokens = new ArrayList<>();
    for (int i = 0; i < sentences.size(); ++i) {
      allTokens.addAll(sentences.get(i).get(CoreAnnotations.TokensAnnotation.class));
    }
    assertEquals(readInDocument.get(CoreAnnotations.TokensAnnotation.class), allTokens);

    // Check the text on each of the words
    // Check the lemmas
    // Check indices and a couple other annotations we expect to be here
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      // guard: expected arrays must be parallel to the token list
      assertEquals(EXPECTED_WORD_TEXT[i].length, tokens.size());
      assertEquals(EXPECTED_LEMMA_TEXT[i].length, tokens.size());
      assertEquals(EXPECTED_CPOS[i].length, tokens.size());
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        // value, word, and original text should all be the surface form
        assertEquals(EXPECTED_WORD_TEXT[i][j], token.value());
        assertEquals(EXPECTED_WORD_TEXT[i][j], token.word());
        assertEquals(EXPECTED_WORD_TEXT[i][j], token.get(CoreAnnotations.OriginalTextAnnotation.class));

        assertEquals(EXPECTED_LEMMA_TEXT[i][j], token.lemma());
        assertEquals(EXPECTED_CPOS[i][j], token.tag());

        assertEquals(Integer.valueOf(i), token.get(CoreAnnotations.SentenceIndexAnnotation.class));
        // token indices are 1-based within a sentence, per CoNLL-U
        assertEquals(Integer.valueOf(j+1), token.get(CoreAnnotations.IndexAnnotation.class));

        // all tokens should have a False isNewline
        assertFalse(token.get(CoreAnnotations.IsNewlineAnnotation.class));
      }
    }

    // Check the MWT features
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        // words 14-15 (indexed one lower here) are the only MWT in this document
        // ("deberse" -> "deber" + "se"); otherwise, all fields should be false
        if (i == 0 && j == 13) {
          assertTrue(token.get(CoreAnnotations.IsMultiWordTokenAnnotation.class));
          assertTrue(token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class));
          assertEquals("deberse", token.get(CoreAnnotations.MWTTokenTextAnnotation.class));
        } else if (i == 0 && j == 14) {
          assertTrue(token.get(CoreAnnotations.IsMultiWordTokenAnnotation.class));
          assertFalse(token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class));
          assertEquals("deberse", token.get(CoreAnnotations.MWTTokenTextAnnotation.class));
        } else {
          assertFalse(token.get(CoreAnnotations.IsMultiWordTokenAnnotation.class));
          assertFalse(token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class));
          assertFalse(token.containsKey(CoreAnnotations.MWTTokenTextAnnotation.class));
        }
      }
    }

    // Check the Before & After features
    // TODO: May need to reconsider the end of sentence treatment
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        if (j == tokens.size() - 1) {
          // last token of a sentence is followed by the sentence-final newline
          assertEquals("\n", token.after());
        } else if (j == tokens.size() - 2) {
          // second-to-last token abuts the final punctuation, so no space
          assertEquals("", token.after());
        } else {
          // TODO: after() should be "" for an MWT
          // it just doesn't get marked on the CoNLLU
          assertEquals(" ", token.after());
        }

        if (i == 0 && j == 0) {
          // very first token of the document has nothing before it
          assertEquals("", token.before());
        } else if (j == 0) {
          assertEquals("\n", token.before());
        } else if (j == tokens.size() - 1) {
          assertEquals("", token.before());
        } else {
          assertEquals(" ", token.before());
        }
      }
    }

    // Check that these fields are set
    // Perhaps not checking the values of the offsets, though
    int tokenCount = 0;
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        assertTrue(token.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
        assertTrue(token.containsKey(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        // token begin/end are document-wide 0-based positions
        assertEquals(Integer.valueOf(tokenCount), token.get(CoreAnnotations.TokenBeginAnnotation.class));
        assertEquals(Integer.valueOf(tokenCount+1), token.get(CoreAnnotations.TokenEndAnnotation.class));
        ++tokenCount;
      }
    }

    // check the features and that there are no fields currently unaccounted for
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      assertEquals(EXPECTED_FEATS[i].length, tokens.size());
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);

        String expected = EXPECTED_FEATS[i][j];
        // 16 is the number of annotations every token carries (listed below)
        int expectedKeys = 16;

        if (expected == null) {
          assertFalse(token.containsKey(CoreAnnotations.CoNLLUFeats.class));
        } else {
          expectedKeys += 1;
          String feats = token.get(CoreAnnotations.CoNLLUFeats.class).toString();
          assertEquals(expected, feats);
        }

        // the MWT token specifically gets one more field, the MWT text
        if (i == 0 && (j == 13 || j == 14)) {
          expectedKeys += 1;
        }
        assertEquals(expectedKeys, token.keySet().size());

        // The known fields should be the ones checked above:
        //   CoreAnnotations.TextAnnotation
        //   CoreAnnotations.ValueAnnotation
        //   CoreAnnotations.OriginalTextAnnotation
        //   CoreAnnotations.IsNewlineAnnotation
        //   CoreAnnotations.LemmaAnnotation
        //   CoreAnnotations.PartOfSpeechAnnotation
        //   CoreAnnotations.IndexAnnotation
        //   CoreAnnotations.AfterAnnotation
        //   CoreAnnotations.BeforeAnnotation
        //   CoreAnnotations.IsMultiWordTokenAnnotation
        //   CoreAnnotations.IsFirstWordOfMWTAnnotation
        //   CoreAnnotations.CharacterOffsetBeginAnnotation
        //   CoreAnnotations.CharacterOffsetEndAnnotation
        //   CoreAnnotations.TokenBeginAnnotation
        //   CoreAnnotations.TokenEndAnnotation
        //   CoreAnnotations.SentenceIndexAnnotation
        // and sometimes
        //   CoreAnnotations.CoNLLUFeats
        //   CoreAnnotations.MWTTokenTextAnnotation
        //
        // TODO: make it always add a Feats, even if it's not present?
      }
    }

    // compare the SemanticGraph
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      assertNotNull(graph);

      List<IndexedWord> vertices = graph.vertexListSorted();
      assertEquals(EXPECTED_WORD_TEXT[i].length, vertices.size());
      assertEquals(EXPECTED_RELNS[i].length, vertices.size());
      assertEquals(EXPECTED_HEADS[i].length, vertices.size());
      for (int j = 0; j < vertices.size(); ++j) {
        IndexedWord vertex = vertices.get(j);
        assertEquals(EXPECTED_WORD_TEXT[i][j], vertex.value());

        // each word should be properly indexed with the sentIndex and position in the sentence
        assertEquals(i, vertex.sentIndex());
        // j+1 because the arrows are laid out with 0 as root, words with a 1-based index
        assertEquals(j+1, vertex.index());

        if (EXPECTED_HEADS[i][j] == 0) {
          assertTrue(graph.isRoot(vertex));
          continue;
        }

        // If not a root, then the word should have exactly one parent
        // The HEAD and RELNS arrays specify the expected parent and relation of the edge
        List<SemanticGraphEdge> edges = graph.getIncomingEdgesSorted(vertex);
        assertEquals(1, edges.size());
        assertEquals(EXPECTED_HEADS[i][j], edges.get(0).getGovernor().index());
        assertEquals(EXPECTED_RELNS[i][j], edges.get(0).getRelation().toString());
      }
    }
  }
76306}
0 commit comments