Skip to content

Commit 01302ee

Browse files
committed
Process SpaceAfter/SpacesAfter only on the last token of an MWT. The after of every other token in the MWT is automatically set to ''
1 parent b2a7645 commit 01302ee

File tree

2 files changed

+34
-44
lines changed

2 files changed

+34
-44
lines changed

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,20 +193,23 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
193193
assertEquals("\n", token.after());
194194
} else if (j == tokens.size() - 2) {
195195
assertEquals("", token.after());
196+
} else if (i == 0 && j == 13) {
197+
assertEquals("", token.after());
196198
} else {
197-
// TODO: after() should be "" for an MWT
198-
// it just doesn't get marked on the CoNLLU
199199
assertEquals(" ", token.after());
200200
}
201201

202202
if (i == 0 && j == 2) {
203203
assertEquals(" ", token.before());
204204
} else if (i == 0 && j == 0) {
205+
// TODO: is it properly reading the SpacesBefore on the first token?
205206
assertEquals("", token.before());
206207
} else if (j == 0) {
207208
assertEquals("\n", token.before());
208209
} else if (j == tokens.size() - 1) {
209210
assertEquals("", token.before());
211+
} else if (i == 0 && j == 14) {
212+
assertEquals("", token.before());
210213
} else {
211214
assertEquals(" ", token.before());
212215
}

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 29 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public class CoNLLUReader {
2727
// doing that requires processing the empty nodes somehow
2828
// TODO: read sent_id?
2929
// TODO: read comments in general
30-
// TODO: MWT should have after/before set to ""
30+
// TODO: SpacesBefore on the first token should be checked
3131
// TODO: reconsider the newline as the after on the last word
3232
public static final int CoNLLU_IndexField = 0;
3333
public static final int CoNLLU_WordField = 1;
@@ -405,28 +405,6 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
405405
}
406406
cl.setIndex(sentenceTokenIndex);
407407

408-
/*
409-
* analyze MISC field for this token
410-
*
411-
* MISC should be a "|" separated list in the final column
412-
*
413-
* example: SpaceAfter=No|NER=PERSON
414-
*
415-
* supported keys:
416-
*
417-
* - SpaceAfter (e.g. No if next token is punctuation mark)
418-
*
419-
*/
420-
if (!fields.get(CoNLLU_MiscField).equals("_")) {
421-
Map<String, String> miscKeyValues = new HashMap<>();
422-
Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
423-
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
424-
String spaceAfter = miscToSpaceAfter(miscKeyValues);
425-
cl.setAfter(spaceAfter);
426-
} else {
427-
cl.setAfter(" ");
428-
}
429-
430408
// handle the MWT info
431409
if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
432410
// set MWT text
@@ -440,22 +418,39 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
440418
} else {
441419
cl.setIsMWTFirst(true);
442420
}
443-
// handle MISC info
444-
// TODO: only do SpaceAfter/SpacesAfter for the last one
445-
// other MWT words should have after==""
446-
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
447-
if (miscInfo != null && !miscInfo.equals("_")) {
421+
// SpaceAfter / SpacesAfter should only apply to the last word in an MWT
422+
// all other words are treated as implicitly having SpaceAfter=No
423+
if (sentence.mwtData.containsKey(sentenceTokenIndex) &&
424+
sentence.mwtData.get(sentenceTokenIndex).equals(sentence.mwtData.get(sentenceTokenIndex-1))) {
425+
// is there a next word MWT?
426+
// and it's the same MWT as this word?
427+
// then we aren't last, so SpaceAfter=No is implied and after is set to ""
428+
cl.setAfter("");
429+
} else {
430+
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
431+
if (miscInfo != null && !miscInfo.equals("_")) {
432+
Map<String, String> miscKeyValues = new HashMap<>();
433+
Arrays.stream(miscInfo.split("\\|")).forEach(
434+
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
435+
String spaceAfter = miscToSpaceAfter(miscKeyValues);
436+
cl.setAfter(spaceAfter);
437+
} else {
438+
cl.setAfter(" ");
439+
}
440+
}
441+
} else {
442+
cl.setIsMWT(false);
443+
cl.setIsMWTFirst(false);
444+
445+
if (!fields.get(CoNLLU_MiscField).equals("_")) {
448446
Map<String, String> miscKeyValues = new HashMap<>();
449-
Arrays.stream(miscInfo.split("\\|")).forEach(
447+
Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
450448
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
451449
String spaceAfter = miscToSpaceAfter(miscKeyValues);
452450
cl.setAfter(spaceAfter);
453451
} else {
454452
cl.setAfter(" ");
455453
}
456-
} else {
457-
cl.setIsMWT(false);
458-
cl.setIsMWTFirst(false);
459454
}
460455
sentenceTokenIndex++;
461456
coreLabels.add(cl);
@@ -465,16 +460,8 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
465460
// set before
466461
coreLabels.get(0).setBefore("");
467462
for (int i = 1 ; i < coreLabels.size() ; i++) {
468-
if (coreLabels.get(i).isMWT() && !coreLabels.get(i).isMWTFirst()) {
469-
// if an MWT derived token and NOT the first one, match before of
470-
// previous ; MWT derived tokens should have same char offsets,
471-
// before, and after of the original token before splitting
472-
coreLabels.get(i).setBefore(coreLabels.get(i-1).before());
473-
} else {
474-
// standard tokens and first derived token from an MWT
475-
// should set before to match after of previous token
476-
coreLabels.get(i).setBefore(coreLabels.get(i - 1).after());
477-
}
463+
// each word's before should match the after of the previous token
464+
coreLabels.get(i).setBefore(coreLabels.get(i - 1).after());
478465
}
479466
// handle MWT tokens and build the final sentence text
480467
int sentenceCharBegin = doc.docText.length();

0 commit comments

Comments
 (0)