@@ -24,8 +24,6 @@ public class CoNLLUReader {
2424 * field constants
2525 **/
2626 // TODO: read sent_id?
27- // TODO: reconsider the newline as the after on the last word
28- // TODO: keep around the rest of the misc annotations
2927 public static final int CoNLLU_IndexField = 0 ;
3028 public static final int CoNLLU_WordField = 1 ;
3129 public static final int CoNLLU_LemmaField = 2 ;
@@ -369,6 +367,24 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
369367 return finalAnnotation ;
370368 }
371369
370+ public static final String rebuildMisc (Map <String , String > miscKeyValues ) {
371+ if (miscKeyValues .size () == 0 ) {
372+ return null ;
373+ }
374+
375+ // rebuild the misc, since we have removed the SpaceAfter, SpacesAfter, and SpacesBefore
376+ StringBuilder misc = new StringBuilder ();
377+ for (Map .Entry <String , String > entry : miscKeyValues .entrySet ()) {
378+ if (misc .length () > 0 ) {
379+ misc .append ("|" );
380+ }
381+ misc .append (entry .getKey ());
382+ misc .append ("=" );
383+ misc .append (entry .getValue ());
384+ }
385+ return misc .toString ();
386+ }
387+
372388 /**
373389 * Convert a single ten column CoNLLU line into a CoreLabel
374390 */
@@ -454,6 +470,12 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
454470 cl .setIsMWTFirst (false );
455471 } else {
456472 cl .setIsMWTFirst (true );
473+
474+ // if we are first, look for SpacesBefore
475+ String mwtSpacesBefore = mwtKeyValues .get ("SpacesBefore" );
476+ if (mwtSpacesBefore != null ) {
477+ cl .setBefore (unescapeSpacesAfter (mwtSpacesBefore ));
478+ }
457479 }
458480 // SpaceAfter / SpacesAfter should only apply to the last word in an MWT
459481 // all other words are treated as implicitly having SpaceAfter=No
@@ -467,6 +489,16 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
467489 String spaceAfter = miscToSpaceAfter (mwtKeyValues );
468490 cl .setAfter (spaceAfter );
469491 }
492+ if (cl .isMWTFirst ()) {
493+ mwtKeyValues .remove ("SpaceAfter" );
494+ mwtKeyValues .remove ("SpacesAfter" );
495+ mwtKeyValues .remove ("SpacesBefore" );
496+
497+ String mwtMisc = rebuildMisc (mwtKeyValues );
498+ if (mwtMisc != null ) {
499+ cl .set (CoreAnnotations .MWTTokenMiscAnnotation .class , mwtMisc );
500+ }
501+ }
470502 } else {
471503 cl .setIsMWT (false );
472504 cl .setIsMWTFirst (false );
@@ -476,18 +508,9 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
476508 }
477509 miscKeyValues .remove ("SpaceAfter" );
478510 miscKeyValues .remove ("SpacesAfter" );
479- if (miscKeyValues .size () > 0 ) {
480- // rebuild the misc, since we have removed the SpaceAfter, SpacesAfter, and SpacesBefore
481- StringBuilder misc = new StringBuilder ();
482- for (Map .Entry <String , String > entry : miscKeyValues .entrySet ()) {
483- if (misc .length () > 0 ) {
484- misc .append ("|" );
485- }
486- misc .append (entry .getKey ());
487- misc .append ("=" );
488- misc .append (entry .getValue ());
489- }
490- cl .set (CoreAnnotations .CoNLLUMisc .class , misc .toString ());
511+ String misc = rebuildMisc (miscKeyValues );
512+ if (misc != null ) {
513+ cl .set (CoreAnnotations .CoNLLUMisc .class , misc );
491514 }
492515 return cl ;
493516 }
0 commit comments