@@ -27,7 +27,7 @@ public class CoNLLUReader {
2727 // doing that requires processing the empty nodes somehow
2828 // TODO: read sent_id?
2929 // TODO: read comments in general
30- // TODO: MWT should have after/before set to ""
30+ // TODO: SpacesBefore on the first token should be checked
3131 // TODO: reconsider the newline as the after on the last word
3232 public static final int CoNLLU_IndexField = 0 ;
3333 public static final int CoNLLU_WordField = 1 ;
@@ -405,28 +405,6 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
405405 }
406406 cl .setIndex (sentenceTokenIndex );
407407
408- /*
409- * analyze MISC field for this token
410- *
411- * MISC should be a "|" separated list in the final column
412- *
413- * example: SpaceAfter=No|NER=PERSON
414- *
415- * supported keys:
416- *
417- * - SpaceAfter (e.g. No if next token is punctuation mark)
418- *
419- */
420- if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
421- Map <String , String > miscKeyValues = new HashMap <>();
422- Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
423- kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
424- String spaceAfter = miscToSpaceAfter (miscKeyValues );
425- cl .setAfter (spaceAfter );
426- } else {
427- cl .setAfter (" " );
428- }
429-
430408 // handle the MWT info
431409 if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
432410 // set MWT text
@@ -440,22 +418,39 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
440418 } else {
441419 cl .setIsMWTFirst (true );
442420 }
443- // handle MISC info
444- // TODO: only do SpaceAfter/SpacesAfter for the last one
445- // other MWT words should have after==""
446- String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
447- if (miscInfo != null && !miscInfo .equals ("_" )) {
421+ // SpaceAfter / SpacesAfter should only apply to the last word in an MWT
422+ // all other words are treated as implicitly having SpaceAfter=No
423+ if (sentence .mwtData .containsKey (sentenceTokenIndex ) &&
424+ sentence .mwtData .get (sentenceTokenIndex ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
425+ // is there a next word MWT?
426+ // and it's the same MWT as this word?
427+ // then we aren't last, so SpaceAfter=No is implied and after is ""
428+ cl .setAfter ("" );
429+ } else {
430+ String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
431+ if (miscInfo != null && !miscInfo .equals ("_" )) {
432+ Map <String , String > miscKeyValues = new HashMap <>();
433+ Arrays .stream (miscInfo .split ("\\ |" )).forEach (
434+ kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
435+ String spaceAfter = miscToSpaceAfter (miscKeyValues );
436+ cl .setAfter (spaceAfter );
437+ } else {
438+ cl .setAfter (" " );
439+ }
440+ }
441+ } else {
442+ cl .setIsMWT (false );
443+ cl .setIsMWTFirst (false );
444+
445+ if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
448446 Map <String , String > miscKeyValues = new HashMap <>();
449- Arrays .stream (miscInfo .split ("\\ |" )).forEach (
447+ Arrays .stream (fields . get ( CoNLLU_MiscField ) .split ("\\ |" )).forEach (
450448 kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
451449 String spaceAfter = miscToSpaceAfter (miscKeyValues );
452450 cl .setAfter (spaceAfter );
453451 } else {
454452 cl .setAfter (" " );
455453 }
456- } else {
457- cl .setIsMWT (false );
458- cl .setIsMWTFirst (false );
459454 }
460455 sentenceTokenIndex ++;
461456 coreLabels .add (cl );
@@ -465,16 +460,8 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
465460 // set before
466461 coreLabels .get (0 ).setBefore ("" );
467462 for (int i = 1 ; i < coreLabels .size () ; i ++) {
468- if (coreLabels .get (i ).isMWT () && !coreLabels .get (i ).isMWTFirst ()) {
469- // if an MWT derived token and NOT the first one, match before of
470- // previous ; MWT derived tokens should have same char offsets,
471- // before, and after of the original token before splitting
472- coreLabels .get (i ).setBefore (coreLabels .get (i -1 ).before ());
473- } else {
474- // standard tokens and first derived token from an MWT
475- // should set before to match after of previous token
476- coreLabels .get (i ).setBefore (coreLabels .get (i - 1 ).after ());
477- }
463+ // each word's before should match the after of the previous token
464+ coreLabels .get (i ).setBefore (coreLabels .get (i - 1 ).after ());
478465 }
479466 // handle MWT tokens and build the final sentence text
480467 int sentenceCharBegin = doc .docText .length ();
0 commit comments