@@ -24,7 +24,6 @@ public class CoNLLUReader {
2424 * field constants
2525 **/
2626 // TODO: we should handle field 8, DEPS, for enhanced dependencies
27- // doing that requires processing the empty nodes somehow
2827 // TODO: read sent_id?
2928 // TODO: read comments in general
3029 // TODO: SpacesBefore on the first token should be checked
@@ -48,6 +47,7 @@ public class CoNLLUReader {
4847 public static Pattern DOCUMENT_LINE = Pattern .compile ("^# newdoc" );
4948 public static Pattern MWT_LINE = Pattern .compile ("^[0-9]+-[0-9]+.*" );
5049 public static Pattern TOKEN_LINE = Pattern .compile ("^[0-9]+\t .*" );
50+ public static Pattern EMPTY_LINE = Pattern .compile ("^[0-9]+[.][0-9]+\t .*" );
5151
5252 /**
5353 * shorthands for CoreAnnotations
@@ -219,6 +219,8 @@ public class CoNLLUSentence {
219219
220220 // the token lines
221221 public List <String > tokenLines = new ArrayList <>();
222+ // in case the enhanced dependencies have empty words
223+ public List <String > emptyLines = new ArrayList <>();
222224 // data for the sentence contained in # key values
223225 public HashMap <String , String > sentenceData = new HashMap <>();
224226 // map indices in token list to mwt data if there is any
@@ -240,8 +242,9 @@ else if (MWT_LINE.matcher(line).matches())
240242 addMWTData (line );
241243 else if (TOKEN_LINE .matcher (line ).matches ())
242244 tokenLines .add (line );
245+ else if (EMPTY_LINE .matcher (line ).matches ())
246+ emptyLines .add (line );
243247 else
244- // TODO: this is ignoring "empty" tokens
245248 return true ;
246249 return false ;
247250 }
@@ -359,7 +362,23 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
359362 public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line ) {
360363 List <String > fields = Arrays .asList (line .split ("\t " ));
361364 CoreLabel cl = new CoreLabel ();
362- int sentenceTokenIndex = Integer .valueOf (fields .get (CoNLLU_IndexField ));
365+
366+ String indexField = fields .get (CoNLLU_IndexField );
367+ int sentenceTokenIndex ;
368+ boolean isEmpty ;
369+ if (indexField .indexOf ('.' ) >= 0 ) {
370+ isEmpty = true ;
371+ String [] indexPieces = indexField .split ("[.]" , 2 );
372+ sentenceTokenIndex = Integer .valueOf (indexPieces [0 ]);
373+ cl .setIndex (sentenceTokenIndex );
374+ int emptyIndex = Integer .valueOf (indexPieces [1 ]);
375+ cl .set (CoreAnnotations .EmptyIndexAnnotation .class , emptyIndex );
376+ } else {
377+ isEmpty = false ;
378+ sentenceTokenIndex = Integer .valueOf (indexField );
379+ cl .setIndex (sentenceTokenIndex );
380+ }
381+
363382 cl .setWord (fields .get (CoNLLU_WordField ));
364383 cl .setValue (fields .get (CoNLLU_WordField ));
365384 cl .setOriginalText (fields .get (CoNLLU_WordField ));
@@ -383,10 +402,14 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
383402 extraColumnIdx ++) {
384403 cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
385404 }
386- cl .setIndex (sentenceTokenIndex );
387405
388- // handle the MWT info
389- if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
406+ // handle the MWT info and after text
407+ if (isEmpty ) {
408+ // don't set an after for empty tokens
409+ // empty tokens are not considered part of MWT
410+ cl .setIsMWT (false );
411+ cl .setIsMWTFirst (false );
412+ } else if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
390413 // set MWT text
391414 cl .set (CoreAnnotations .MWTTokenTextAnnotation .class ,
392415 sentence .mwtTokens .get (sentence .mwtData .get (sentenceTokenIndex - 1 )));
@@ -487,6 +510,12 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
487510 }
488511 }
489512
513+ List <CoreLabel > emptyLabels = new ArrayList <CoreLabel >();
514+ for (String line : sentence .emptyLines ) {
515+ CoreLabel cl = convertLineToCoreLabel (sentence , line );
516+ emptyLabels .add (cl );
517+ }
518+
490519 // build SemanticGraphEdges
491520 List <SemanticGraphEdge > graphEdges = new ArrayList <>();
492521 for (int i = 0 ; i < lines .size (); i ++) {
@@ -505,6 +534,11 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
505534 Annotation sentenceCoreMap = new Annotation (doc .docText .substring (sentenceCharBegin ).trim ());
506535 // add tokens
507536 sentenceCoreMap .set (CoreAnnotations .TokensAnnotation .class , coreLabels );
537+ // add empty tokens, if any exist
538+ if (emptyLabels .size () > 0 ) {
539+ sentenceCoreMap .set (CoreAnnotations .EmptyTokensAnnotation .class , emptyLabels );
540+ }
541+
508542 // add dependency graph
509543 sentenceCoreMap .set (SemanticGraphCoreAnnotations .BasicDependenciesAnnotation .class , depParse );
510544 return sentenceCoreMap ;
0 commit comments