@@ -326,15 +326,14 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
326326 // build sentences
327327 List <CoreMap > sentences = new ArrayList <>();
328328 for (CoNLLUSentence sent : doc .sentences ) {
329- sentences .add (convertCoNLLUSentenceToCoreMap (doc , sent ));
329+ // pass in the sentences.size() so we can build the CoreLabels with the correct sentIndex()
330+ // this way, we don't mess up the hashCodes later
331+ sentences .add (convertCoNLLUSentenceToCoreMap (doc , sent , sentences .size ()));
330332 }
331333 // set sentences
332334 finalAnnotation .set (CoreAnnotations .SentencesAnnotation .class , sentences );
333335 // build document wide CoreLabels list
334- // TODO: do we need to put new SentenceIndexAnnotations on each of the IndexedWords?
335- // TODO: what about document annotation?
336- // We should confirm that setting the SentenceIndexAnnotation like this isn't
337- // distorting any of the SemanticGraphs
336+ // TODO: should we set document annotation?
338337 List <CoreLabel > tokens = new ArrayList <>();
339338 finalAnnotation .set (CoreAnnotations .TokensAnnotation .class , tokens );
340339 int documentIdx = 0 ;
@@ -351,15 +350,9 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
351350 for (CoreLabel token : sentence .get (CoreAnnotations .TokensAnnotation .class )) {
352351 token .set (CoreAnnotations .TokenBeginAnnotation .class , documentIdx );
353352 token .set (CoreAnnotations .TokenEndAnnotation .class , documentIdx + 1 );
354- token .set (CoreAnnotations .SentenceIndexAnnotation .class , sentenceIdx );
355353 tokens .add (token );
356354 documentIdx ++;
357355 }
358- if (sentence .containsKey (CoreAnnotations .EmptyTokensAnnotation .class )) {
359- for (CoreLabel token : sentence .get (CoreAnnotations .EmptyTokensAnnotation .class )) {
360- token .set (CoreAnnotations .SentenceIndexAnnotation .class , sentenceIdx );
361- }
362- }
363356 sentenceIdx ++;
364357 }
365358 // make sure to set docText AFTER all the above processing
@@ -389,9 +382,10 @@ public static final String rebuildMisc(Map<String, String> miscKeyValues) {
389382 /**
390383 * Convert a single ten column CoNLLU line into a CoreLabel
391384 */
392- public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line ) {
385+ public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line , int sentenceIdx ) {
393386 List <String > fields = Arrays .asList (line .split ("\t " ));
394387 CoreLabel cl = new CoreLabel ();
388+ cl .set (CoreAnnotations .SentenceIndexAnnotation .class , sentenceIdx );
395389
396390 String indexField = fields .get (CoNLLU_IndexField );
397391 int sentenceTokenIndex ;
@@ -522,12 +516,12 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
522516 /**
523517 * Convert a list of CoNLL-U token lines into a sentence CoreMap
524518 **/
525- public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence ) {
519+ public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence , int sentenceIdx ) {
526520 List <String > lines = sentence .tokenLines ;
527521 // create CoreLabels
528522 List <CoreLabel > coreLabels = new ArrayList <CoreLabel >();
529523 for (String line : lines ) {
530- CoreLabel cl = convertLineToCoreLabel (sentence , line );
524+ CoreLabel cl = convertLineToCoreLabel (sentence , line , sentenceIdx );
531525 coreLabels .add (cl );
532526 }
533527 for (int i = 1 ; i < coreLabels .size () ; i ++) {
@@ -570,7 +564,7 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
570564
571565 List <CoreLabel > emptyLabels = new ArrayList <CoreLabel >();
572566 for (String line : sentence .emptyLines ) {
573- CoreLabel cl = convertLineToCoreLabel (sentence , line );
567+ CoreLabel cl = convertLineToCoreLabel (sentence , line , sentenceIdx );
574568 emptyLabels .add (cl );
575569 }
576570
0 commit comments