@@ -108,6 +108,79 @@ public CoNLLUReader(Properties props) throws ClassNotFoundException {
108108 columnCount += extraColumns .size ();
109109 }
110110
111+ // TODO: is there a better place for this?
112+ public static String unescapeSpacesAfter (String escaped ) {
113+ int idx = 0 ;
114+ StringBuilder unescaped = new StringBuilder ();
115+ while (idx < escaped .length ()) {
116+ if (escaped .charAt (idx ) != '\\' ) {
117+ unescaped .append (escaped .charAt (idx ));
118+ ++idx ;
119+ continue ;
120+ }
121+ if (idx + 2 <= escaped .length ()) {
122+ String piece = escaped .substring (idx , idx + 2 );
123+ if (piece .equals ("\\ s" )) {
124+ unescaped .append (' ' );
125+ idx += 2 ;
126+ continue ;
127+ } else if (piece .equals ("\\ t" )) {
128+ unescaped .append ('\t' );
129+ idx += 2 ;
130+ continue ;
131+ } else if (piece .equals ("\\ r" )) {
132+ unescaped .append ('\r' );
133+ idx += 2 ;
134+ continue ;
135+ } else if (piece .equals ("\\ n" )) {
136+ unescaped .append ('\n' );
137+ idx += 2 ;
138+ continue ;
139+ } else if (piece .equals ("\\ p" )) {
140+ unescaped .append ('|' );
141+ idx += 2 ;
142+ continue ;
143+ } else if (piece .equals ("\\ \\ " )) {
144+ unescaped .append ('\\' );
145+ idx += 2 ;
146+ continue ;
147+ }
148+ }
149+ if (idx + 6 <= escaped .length ()) {
150+ String piece = escaped .substring (idx , idx + 6 );
151+ if (piece .equals ("\\ u00A0" )) {
152+ unescaped .append (' ' );
153+ idx += 6 ;
154+ continue ;
155+ }
156+ }
157+ unescaped .append (escaped .charAt (idx ));
158+ ++idx ;
159+ }
160+ return unescaped .toString ();
161+ }
162+
163+ public static String miscToSpaceAfter (Map <String , String > miscKeyValues ) {
164+ String spaceAfter = miscKeyValues .get ("SpaceAfter" );
165+ if (spaceAfter != null ) {
166+ if (spaceAfter .equals ("No" ) || spaceAfter .equals ("no" )) {
167+ return "" ;
168+ } else if (spaceAfter .equals ("No~" )) {
169+ // a random data bug in UD 2.11 Russian-Taiga
170+ return "" ;
171+ } else {
172+ return " " ;
173+ }
174+ }
175+
176+ String spacesAfter = miscKeyValues .get ("SpacesAfter" );
177+ if (spacesAfter != null ) {
178+ return unescapeSpacesAfter (spacesAfter );
179+ }
180+
181+ return " " ;
182+ }
183+
111184 /**
112185 * class to store info for a CoNLL-U document
113186 **/
@@ -345,15 +418,11 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
345418 *
346419 */
347420 if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
348- HashMap <String , String > miscKeyValues = new HashMap <>();
421+ Map <String , String > miscKeyValues = new HashMap <>();
349422 Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
350- kv -> miscKeyValues .put (kv .split ("=" )[0 ], kv .split ("=" )[1 ]));
351- // unless SpaceAfter=No, add a space after this token
352- if (!miscKeyValues .getOrDefault ("SpaceAfter" , "Yes" ).equals ("No" )) {
353- cl .setAfter (" " );
354- } else {
355- cl .setAfter ("" );
356- }
423+ kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
424+ String spaceAfter = miscToSpaceAfter (miscKeyValues );
425+ cl .setAfter (spaceAfter );
357426 } else {
358427 cl .setAfter (" " );
359428 }
@@ -372,11 +441,17 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
372441 cl .setIsMWTFirst (true );
373442 }
374443 // handle MISC info
444+ // TODO: only do SpaceAfter/SpacesAfter for the last one
445+ // other MWT words should have after==""
375446 String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
376- for (String miscKV : miscInfo .split ("\\ |" )) {
377- if (miscKV .startsWith ("SpaceAfter" )) {
378- cl .setAfter (miscKV .split ("=" )[1 ].equals ("No" ) ? "" : " " );
379- }
447+ if (miscInfo != null && !miscInfo .equals ("_" )) {
448+ Map <String , String > miscKeyValues = new HashMap <>();
449+ Arrays .stream (miscInfo .split ("\\ |" )).forEach (
450+ kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
451+ String spaceAfter = miscToSpaceAfter (miscKeyValues );
452+ cl .setAfter (spaceAfter );
453+ } else {
454+ cl .setAfter (" " );
380455 }
381456 } else {
382457 cl .setIsMWT (false );
0 commit comments