@@ -354,89 +354,96 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
354354 }
355355
356356 /**
357- * Convert a list of CoNLL-U token lines into a sentence CoreMap
358- **/
359- public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence ) {
360- List <String > lines = sentence .tokenLines ;
361- // create CoreLabels
362- List <CoreLabel > coreLabels = new ArrayList <CoreLabel >();
363- int sentenceTokenIndex = 1 ;
364- for (String line : lines ) {
365- List <String > fields = Arrays .asList (line .split ("\t " ));
366- CoreLabel cl = new CoreLabel ();
367- cl .setWord (fields .get (CoNLLU_WordField ));
368- cl .setValue (fields .get (CoNLLU_WordField ));
369- cl .setOriginalText (fields .get (CoNLLU_WordField ));
370- cl .setIsNewline (false );
371-
372- if (!fields .get (CoNLLU_LemmaField ).equals ("_" ))
373- cl .setLemma (fields .get (CoNLLU_LemmaField ));
374-
375- if (!fields .get (CoNLLU_UPOSField ).equals ("_" ))
376- cl .set (CoreAnnotations .CoarseTagAnnotation .class , fields .get (CoNLLU_UPOSField ));
377-
378- final String xpos = fields .get (CoNLLU_XPOSField );
379- if (!xpos .equals ("_" ))
380- cl .setTag (xpos );
381-
382- if (!fields .get (CoNLLU_FeaturesField ).equals ("_" )) {
383- CoNLLUFeatures features = new CoNLLUFeatures (fields .get (CoNLLU_FeaturesField ));
384- cl .set (CoreAnnotations .CoNLLUFeats .class , features );
385- }
386- for (int extraColumnIdx = 10 ; extraColumnIdx < columnCount && extraColumnIdx < fields .size ();
387- extraColumnIdx ++) {
388- cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
357+ * Convert a single ten column CoNLLU line into a CoreLabel
358+ */
359+ public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line ) {
360+ List <String > fields = Arrays .asList (line .split ("\t " ));
361+ CoreLabel cl = new CoreLabel ();
362+ int sentenceTokenIndex = Integer .valueOf (fields .get (CoNLLU_IndexField ));
363+ cl .setWord (fields .get (CoNLLU_WordField ));
364+ cl .setValue (fields .get (CoNLLU_WordField ));
365+ cl .setOriginalText (fields .get (CoNLLU_WordField ));
366+ cl .setIsNewline (false );
367+
368+ if (!fields .get (CoNLLU_LemmaField ).equals ("_" ))
369+ cl .setLemma (fields .get (CoNLLU_LemmaField ));
370+
371+ if (!fields .get (CoNLLU_UPOSField ).equals ("_" ))
372+ cl .set (CoreAnnotations .CoarseTagAnnotation .class , fields .get (CoNLLU_UPOSField ));
373+
374+ final String xpos = fields .get (CoNLLU_XPOSField );
375+ if (!xpos .equals ("_" ))
376+ cl .setTag (xpos );
377+
378+ if (!fields .get (CoNLLU_FeaturesField ).equals ("_" )) {
379+ CoNLLUFeatures features = new CoNLLUFeatures (fields .get (CoNLLU_FeaturesField ));
380+ cl .set (CoreAnnotations .CoNLLUFeats .class , features );
381+ }
382+ for (int extraColumnIdx = 10 ; extraColumnIdx < columnCount && extraColumnIdx < fields .size ();
383+ extraColumnIdx ++) {
384+ cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
385+ }
386+ cl .setIndex (sentenceTokenIndex );
387+
388+ // handle the MWT info
389+ if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
390+ // set MWT text
391+ cl .set (CoreAnnotations .MWTTokenTextAnnotation .class ,
392+ sentence .mwtTokens .get (sentence .mwtData .get (sentenceTokenIndex - 1 )));
393+ cl .setIsMWT (true );
394+ // check if first
395+ if (sentence .mwtData .containsKey (sentenceTokenIndex - 2 ) &&
396+ sentence .mwtData .get (sentenceTokenIndex -2 ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
397+ cl .setIsMWTFirst (false );
398+ } else {
399+ cl .setIsMWTFirst (true );
389400 }
390- cl .setIndex (sentenceTokenIndex );
391-
392- // handle the MWT info
393- if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
394- // set MWT text
395- cl .set (CoreAnnotations .MWTTokenTextAnnotation .class ,
396- sentence .mwtTokens .get (sentence .mwtData .get (sentenceTokenIndex - 1 )));
397- cl .setIsMWT (true );
398- // check if first
399- if (sentence .mwtData .containsKey (sentenceTokenIndex - 2 ) &&
400- sentence .mwtData .get (sentenceTokenIndex -2 ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
401- cl .setIsMWTFirst (false );
402- } else {
403- cl .setIsMWTFirst (true );
404- }
405- // SpaceAfter / SpacesAfter should only apply to the last word in an MWT
406- // all other words are treated as implicitly having SpaceAfter=No
407- if (sentence .mwtData .containsKey (sentenceTokenIndex ) &&
408- sentence .mwtData .get (sentenceTokenIndex ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
409- // is there a next word MWT?
410- // and it's the same MWT as this word?
411- // then we aren't last, and SpaceAfter="" is implicitly true
412- cl .setAfter ("" );
413- } else {
414- String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
415- if (miscInfo != null && !miscInfo .equals ("_" )) {
416- Map <String , String > miscKeyValues = new HashMap <>();
417- Arrays .stream (miscInfo .split ("\\ |" )).forEach (
418- kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
419- String spaceAfter = miscToSpaceAfter (miscKeyValues );
420- cl .setAfter (spaceAfter );
421- } else {
422- cl .setAfter (" " );
423- }
424- }
401+ // SpaceAfter / SpacesAfter should only apply to the last word in an MWT
402+ // all other words are treated as implicitly having SpaceAfter=No
403+ if (sentence .mwtData .containsKey (sentenceTokenIndex ) &&
404+ sentence .mwtData .get (sentenceTokenIndex ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
405+ // is there a next word MWT?
406+ // and it's the same MWT as this word?
407+ // then we aren't last, and SpaceAfter="" is implicitly true
408+ cl .setAfter ("" );
425409 } else {
426- cl .setIsMWT (false );
427- cl .setIsMWTFirst (false );
428-
429- if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
410+ String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
411+ if (miscInfo != null && !miscInfo .equals ("_" )) {
430412 Map <String , String > miscKeyValues = new HashMap <>();
431- Arrays .stream (fields . get ( CoNLLU_MiscField ) .split ("\\ |" )).forEach (
413+ Arrays .stream (miscInfo .split ("\\ |" )).forEach (
432414 kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
433415 String spaceAfter = miscToSpaceAfter (miscKeyValues );
434416 cl .setAfter (spaceAfter );
435417 } else {
436418 cl .setAfter (" " );
437419 }
438420 }
439- sentenceTokenIndex ++;
421+ } else {
422+ cl .setIsMWT (false );
423+ cl .setIsMWTFirst (false );
424+
425+ if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
426+ Map <String , String > miscKeyValues = new HashMap <>();
427+ Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
428+ kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
429+ String spaceAfter = miscToSpaceAfter (miscKeyValues );
430+ cl .setAfter (spaceAfter );
431+ } else {
432+ cl .setAfter (" " );
433+ }
434+ }
435+ return cl ;
436+ }
437+
438+ /**
439+ * Convert a list of CoNLL-U token lines into a sentence CoreMap
440+ **/
441+ public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence ) {
442+ List <String > lines = sentence .tokenLines ;
443+ // create CoreLabels
444+ List <CoreLabel > coreLabels = new ArrayList <CoreLabel >();
445+ for (String line : lines ) {
446+ CoreLabel cl = convertLineToCoreLabel (sentence , line );
440447 coreLabels .add (cl );
441448 }
442449 // the last token should have a newline after
0 commit comments