55import edu .stanford .nlp .util .*;
66
77import static org .junit .Assert .assertEquals ;
8+ import static org .junit .Assert .assertFalse ;
9+ import static org .junit .Assert .assertNotNull ;
10+ import static org .junit .Assert .assertTrue ;
811
912import java .io .*;
13+ import java .util .ArrayList ;
14+ import java .util .List ;
1015import java .util .Properties ;
1116
1217import org .junit .Before ;
@@ -24,53 +29,278 @@ public class CoNLLUReaderITest {
  // Document produced by running the full pipeline over the example text.
  // NOTE(review): nothing visible in this chunk populates goldDocument any
  // more (the @Before setUp that filled it was removed) -- confirm whether
  // this field is still needed or can be deleted.
  public Annotation goldDocument;
  // Document produced by reading the example CoNLL-U file back in.
  public Annotation readInDocument;

27- @ Before
28- public void setUp () throws IOException {
29- // set up the pipeline
30- Properties props = LanguageInfo .getLanguageProperties ("spanish" );
31- props .put ("annotators" , "tokenize,ssplit,mwt,pos,lemma,depparse" );
32- pipeline = new StanfordCoreNLP (props );
33- }
  // Raw text of each sentence in the example CoNLL-U document.
  static final String[] EXPECTED_SENTENCE_TEXT = {
    "Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
    "De allí las rebajas."
  };
  // Whole-document text: one sentence per line with a trailing newline.
  // NOTE(review): System.lineSeparator() is "\r\n" on Windows -- confirm that
  // CoNLLUReader joins sentence texts with the platform separator and not a
  // hard-coded "\n", otherwise this test is platform-dependent.
  static final String EXPECTED_TEXT = String.join(System.lineSeparator(), EXPECTED_SENTENCE_TEXT) + System.lineSeparator();

  // Expected surface form of every token, one row per sentence.
  // "deberse" is split into the two MWT words "deber" + "se".
  static final String[][] EXPECTED_WORD_TEXT = {
    {"Pero", "la", "existencia", "de", "dos", "recién", "nacidos", "en", "la", "misma", "caja", "sólo", "podía", "deber", "se", "a", "un", "descuido", "de", "fábrica", "."},
    {"De", "allí", "las", "rebajas", "."},
  };

  // Expected lemma of every token (parallel to EXPECTED_WORD_TEXT).
  static final String[][] EXPECTED_LEMMA_TEXT = {
    {"pero", "el", "existencia", "de", "dos", "recién", "nacido", "en", "el", "mismo", "caja", "sólo", "poder", "deber", "él", "a", "uno", "descuido", "de", "fábrica", "."},
    {"de", "allí", "el", "rebaja", "."},
  };

  // Expected universal POS (the CoNLL-U UPOS column) of every token.
  static final String[][] EXPECTED_CPOS = {
    {"CCONJ", "DET", "NOUN", "ADP", "NUM", "ADV", "ADJ", "ADP", "DET", "DET", "NOUN", "ADV", "AUX", "VERB", "PRON", "ADP", "DET", "NOUN", "ADP", "NOUN", "PUNCT"},
    {"ADP", "ADV", "DET", "NOUN", "PUNCT"},
  };

  // Expected CoNLL-U FEATS column for every token; null where the token has
  // no morphological features (FEATS column is "_").
  static final String[][] EXPECTED_FEATS = {
    {
      null,
      "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
      "Gender=Fem|Number=Sing",
      null,
      "Number=Plur|NumForm=Word|NumType=Card",
      null,
      "Gender=Masc|Number=Plur|VerbForm=Part",
      null,
      "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
      "Gender=Fem|Number=Sing|PronType=Dem",
      "Gender=Fem|Number=Sing",
      null,
      "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
      "VerbForm=Inf",
      "Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes",
      null,
      "Definite=Ind|Gender=Masc|Number=Sing|PronType=Art",
      "Gender=Masc|Number=Sing",
      null,
      "Gender=Fem|Number=Sing",
      "PunctType=Peri",
    },
    {
      null,
      null,
      "Definite=Def|Gender=Fem|Number=Plur|PronType=Art",
      "Gender=Fem|Number=Plur",
      "PunctType=Peri",
    }
  };

  // Expected dependency relation of each token to its head (DEPREL column).
  static final String[][] EXPECTED_RELNS = {
    { "advmod", "det", "nsubj", "case", "nummod", "advmod", "amod", "case", "det", "det", "nmod", "advmod", "aux", "root",
      "expl:pv", "case", "det", "obl:arg", "case", "nmod", "punct" },
    { "case", "advmod", "det", "root", "punct" },
  };
  // Expected head index of each token, 1-based; 0 marks the sentence root.
  static final int[][] EXPECTED_HEADS = {
    { 14, 3, 14, 7, 7, 7, 3, 11, 11, 9, 3, 14, 14, 0, 14, 18, 18, 14, 20, 18, 14 },
    { 2, 4, 4, 0, 4 },
  };
3495
  /**
   * Reads the example CoNLL-U file with CoNLLUReader and checks every
   * annotation it is expected to produce against the EXPECTED_* constants:
   * document and sentence text, token words / lemmas / UPOS tags, sentence
   * and token indices, MWT markers, before/after whitespace, character and
   * token offsets, CoNLL-U features, and the basic dependency graph of each
   * sentence.
   */
  @Test
  public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException {
    // examplePath points at the example .conllu file; the reader returns one
    // Annotation per document and this file contains exactly one document
    readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(examplePath).get(0);

    // the document should carry exactly these three annotations and no others
    assertTrue(readInDocument.containsKey(CoreAnnotations.TextAnnotation.class));
    assertTrue(readInDocument.containsKey(CoreAnnotations.TokensAnnotation.class));
    assertTrue(readInDocument.containsKey(CoreAnnotations.SentencesAnnotation.class));
    assertEquals(3, readInDocument.keySet().size());

    // Compare text of the document and its sentences
    assertEquals(EXPECTED_TEXT, readInDocument.get(CoreAnnotations.TextAnnotation.class));
    List<CoreMap> sentences = readInDocument.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(EXPECTED_SENTENCE_TEXT.length, sentences.size());
    for (int i = 0; i < EXPECTED_SENTENCE_TEXT.length; ++i) {
      assertEquals(EXPECTED_SENTENCE_TEXT[i], sentences.get(i).get(CoreAnnotations.TextAnnotation.class));
    }

    // Compare sentence ids
    // Check number of keys on each sentence
    for (int i = 0; i < sentences.size(); ++i) {
      assertEquals(Integer.valueOf(i), sentences.get(i).get(CoreAnnotations.SentenceIndexAnnotation.class));
      assertEquals(4, sentences.get(i).keySet().size());
    }

    // Check the document tokens and the sentence tokens lists are the same
    // The composite list on the document level should just be the sentence tokens gathered into one list
    List<CoreMap> allTokens = new ArrayList<>();
    for (int i = 0; i < sentences.size(); ++i) {
      allTokens.addAll(sentences.get(i).get(CoreAnnotations.TokensAnnotation.class));
    }
    assertEquals(readInDocument.get(CoreAnnotations.TokensAnnotation.class), allTokens);

    // Check the text on each of the words
    // Check the lemmas
    // Check indices and a couple other annotations we expect to be here
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      // guard: expected arrays must be parallel to the token list
      assertEquals(EXPECTED_WORD_TEXT[i].length, tokens.size());
      assertEquals(EXPECTED_LEMMA_TEXT[i].length, tokens.size());
      assertEquals(EXPECTED_CPOS[i].length, tokens.size());
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        // value, word, and original text should all be the surface form
        assertEquals(EXPECTED_WORD_TEXT[i][j], token.value());
        assertEquals(EXPECTED_WORD_TEXT[i][j], token.word());
        assertEquals(EXPECTED_WORD_TEXT[i][j], token.get(CoreAnnotations.OriginalTextAnnotation.class));

        assertEquals(EXPECTED_LEMMA_TEXT[i][j], token.lemma());
        assertEquals(EXPECTED_CPOS[i][j], token.tag());

        assertEquals(Integer.valueOf(i), token.get(CoreAnnotations.SentenceIndexAnnotation.class));
        // token indices are 1-based within a sentence, per CoNLL-U
        assertEquals(Integer.valueOf(j+1), token.get(CoreAnnotations.IndexAnnotation.class));

        // all tokens should have a False isNewline
        assertFalse(token.get(CoreAnnotations.IsNewlineAnnotation.class));
      }
    }

    // Check the MWT features
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        // words 14-15 (indexed one lower here) are the only MWT in this document
        // ("deberse" -> "deber" + "se"); otherwise, all fields should be false
        if (i == 0 && j == 13) {
          assertTrue(token.get(CoreAnnotations.IsMultiWordTokenAnnotation.class));
          assertTrue(token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class));
          assertEquals("deberse", token.get(CoreAnnotations.MWTTokenTextAnnotation.class));
        } else if (i == 0 && j == 14) {
          assertTrue(token.get(CoreAnnotations.IsMultiWordTokenAnnotation.class));
          assertFalse(token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class));
          assertEquals("deberse", token.get(CoreAnnotations.MWTTokenTextAnnotation.class));
        } else {
          assertFalse(token.get(CoreAnnotations.IsMultiWordTokenAnnotation.class));
          assertFalse(token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class));
          assertFalse(token.containsKey(CoreAnnotations.MWTTokenTextAnnotation.class));
        }
      }
    }

    // Check the Before & After features
    // TODO: May need to reconsider the end of sentence treatment
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        if (j == tokens.size() - 1) {
          // last token of a sentence is followed by the sentence-final newline
          assertEquals("\n", token.after());
        } else if (j == tokens.size() - 2) {
          // second-to-last token abuts the final punctuation, so no space
          assertEquals("", token.after());
        } else {
          // TODO: after() should be "" for an MWT
          // it just doesn't get marked on the CoNLLU
          assertEquals(" ", token.after());
        }

        if (i == 0 && j == 0) {
          // very first token of the document has nothing before it
          assertEquals("", token.before());
        } else if (j == 0) {
          assertEquals("\n", token.before());
        } else if (j == tokens.size() - 1) {
          assertEquals("", token.before());
        } else {
          assertEquals(" ", token.before());
        }
      }
    }

    // Check that these fields are set
    // Perhaps not checking the values of the offsets, though
    int tokenCount = 0;
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);
        assertTrue(token.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
        assertTrue(token.containsKey(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        // token begin/end are document-wide 0-based positions
        assertEquals(Integer.valueOf(tokenCount), token.get(CoreAnnotations.TokenBeginAnnotation.class));
        assertEquals(Integer.valueOf(tokenCount+1), token.get(CoreAnnotations.TokenEndAnnotation.class));
        ++tokenCount;
      }
    }

    // check the features and that there are no fields currently unaccounted for
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      assertEquals(EXPECTED_FEATS[i].length, tokens.size());
      for (int j = 0; j < tokens.size(); ++j) {
        CoreLabel token = tokens.get(j);

        String expected = EXPECTED_FEATS[i][j];
        // 16 is the number of annotations every token carries (listed below)
        int expectedKeys = 16;

        if (expected == null) {
          assertFalse(token.containsKey(CoreAnnotations.CoNLLUFeats.class));
        } else {
          expectedKeys += 1;
          String feats = token.get(CoreAnnotations.CoNLLUFeats.class).toString();
          assertEquals(expected, feats);
        }

        // the MWT token specifically gets one more field, the MWT text
        if (i == 0 && (j == 13 || j == 14)) {
          expectedKeys += 1;
        }
        assertEquals(expectedKeys, token.keySet().size());

        // The known fields should be the ones checked above:
        //   CoreAnnotations.TextAnnotation
        //   CoreAnnotations.ValueAnnotation
        //   CoreAnnotations.OriginalTextAnnotation
        //   CoreAnnotations.IsNewlineAnnotation
        //   CoreAnnotations.LemmaAnnotation
        //   CoreAnnotations.PartOfSpeechAnnotation
        //   CoreAnnotations.IndexAnnotation
        //   CoreAnnotations.AfterAnnotation
        //   CoreAnnotations.BeforeAnnotation
        //   CoreAnnotations.IsMultiWordTokenAnnotation
        //   CoreAnnotations.IsFirstWordOfMWTAnnotation
        //   CoreAnnotations.CharacterOffsetBeginAnnotation
        //   CoreAnnotations.CharacterOffsetEndAnnotation
        //   CoreAnnotations.TokenBeginAnnotation
        //   CoreAnnotations.TokenEndAnnotation
        //   CoreAnnotations.SentenceIndexAnnotation
        // and sometimes
        //   CoreAnnotations.CoNLLUFeats
        //   CoreAnnotations.MWTTokenTextAnnotation
        //
        // TODO: make it always add a Feats, even if it's not present?
      }
    }

    // compare the SemanticGraph
    for (int i = 0; i < sentences.size(); ++i) {
      CoreMap sentence = sentences.get(i);
      SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      assertNotNull(graph);

      List<IndexedWord> vertices = graph.vertexListSorted();
      assertEquals(EXPECTED_WORD_TEXT[i].length, vertices.size());
      assertEquals(EXPECTED_RELNS[i].length, vertices.size());
      assertEquals(EXPECTED_HEADS[i].length, vertices.size());
      for (int j = 0; j < vertices.size(); ++j) {
        IndexedWord vertex = vertices.get(j);
        assertEquals(EXPECTED_WORD_TEXT[i][j], vertex.value());

        // each word should be properly indexed with the sentIndex and position in the sentence
        assertEquals(i, vertex.sentIndex());
        // j+1 because the arrows are laid out with 0 as root, words with a 1-based index
        assertEquals(j+1, vertex.index());

        if (EXPECTED_HEADS[i][j] == 0) {
          assertTrue(graph.isRoot(vertex));
          continue;
        }

        // If not a root, then the word should have exactly one parent
        // The HEAD and RELNS arrays specify the expected parent and relation of the edge
        List<SemanticGraphEdge> edges = graph.getIncomingEdgesSorted(vertex);
        assertEquals(1, edges.size());
        assertEquals(EXPECTED_HEADS[i][j], edges.get(0).getGovernor().index());
        assertEquals(EXPECTED_RELNS[i][j], edges.get(0).getRelation().toString());
      }
    }
  }
76306}
0 commit comments