@@ -25,8 +25,8 @@ public class CoNLLUReader {
2525 **/
2626 // TODO: read sent_id?
2727 // TODO: read comments in general
28- // TODO: SpacesBefore on the first token should be checked
2928 // TODO: reconsider using a newline as the "after" text of the last word
29+ // TODO: keep around the rest of the misc annotations
3030 public static final int CoNLLU_IndexField = 0 ;
3131 public static final int CoNLLU_WordField = 1 ;
3232 public static final int CoNLLU_LemmaField = 2 ;
@@ -408,6 +408,19 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
408408 cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
409409 }
410410
411+ Map <String , String > miscKeyValues = new HashMap <>();
412+ if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
413+ Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
414+ kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
415+ }
416+
417+ // SpacesBefore on a word that isn't the first in a document will
418+ // be replaced with the SpacesAfter from the previous token later
419+ String spacesBefore = miscKeyValues .get ("SpacesBefore" );
420+ if (spacesBefore != null ) {
421+ cl .setBefore (unescapeSpacesAfter (spacesBefore ));
422+ }
423+
411424 // handle the MWT info and after text
412425 if (isEmpty ) {
413426 // don't set an after for empty tokens
@@ -437,10 +450,10 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
437450 } else {
438451 String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
439452 if (miscInfo != null && !miscInfo .equals ("_" )) {
440- Map <String , String > miscKeyValues = new HashMap <>();
453+ Map <String , String > mwtKeyValues = new HashMap <>();
441454 Arrays .stream (miscInfo .split ("\\ |" )).forEach (
442- kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
443- String spaceAfter = miscToSpaceAfter (miscKeyValues );
455+ kv -> mwtKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
456+ String spaceAfter = miscToSpaceAfter (mwtKeyValues );
444457 cl .setAfter (spaceAfter );
445458 } else {
446459 cl .setAfter (" " );
@@ -450,15 +463,8 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
450463 cl .setIsMWT (false );
451464 cl .setIsMWTFirst (false );
452465
453- if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
454- Map <String , String > miscKeyValues = new HashMap <>();
455- Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
456- kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
457- String spaceAfter = miscToSpaceAfter (miscKeyValues );
458- cl .setAfter (spaceAfter );
459- } else {
460- cl .setAfter (" " );
461- }
466+ String spaceAfter = miscToSpaceAfter (miscKeyValues );
467+ cl .setAfter (spaceAfter );
462468 }
463469 return cl ;
464470 }
@@ -477,7 +483,9 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
477483 // the last token should have a newline after
478484 coreLabels .get (coreLabels .size () - 1 ).setAfter (System .lineSeparator ());
479485 // set the before text: the first token keeps a before annotation it already has, otherwise it defaults to ""
480- coreLabels .get (0 ).setBefore ("" );
486+ if (!coreLabels .get (0 ).containsKey (CoreAnnotations .BeforeAnnotation .class )) {
487+ coreLabels .get (0 ).setBefore ("" );
488+ }
481489 for (int i = 1 ; i < coreLabels .size () ; i ++) {
482490 // all words should match the after of the previous token
483491 coreLabels .get (i ).setBefore (coreLabels .get (i - 1 ).after ());
0 commit comments