Skip to content

Commit b2a7645

Browse files
committed
Process SpacesAfter as well as SpaceAfter in the CoNLLUReader
1 parent 3d57346 commit b2a7645

File tree

3 files changed

+97
-19
lines changed

3 files changed

+97
-19
lines changed

data/edu/stanford/nlp/pipeline/es-example.conllu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# sent_id = 3LB-CAST-a22-0-s10
2-
# text = Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.
2+
# text = Pero la  existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.
33
# orig_file_sentence 001#48
4+
# Note = This sentence is from the dev set, but we manually updated word 2 to have SpacesAfter=\s\s
45
1 Pero pero CCONJ cc _ 14 advmod 14:advmod _
5-
2 la el DET da0fs0 Definite=Def|Gender=Fem|Number=Sing|PronType=Art 3 det 3:det _
6+
2 la el DET da0fs0 Definite=Def|Gender=Fem|Number=Sing|PronType=Art 3 det 3:det SpacesAfter=\s\s
67
3 existencia existencia NOUN ncfs000 Gender=Fem|Number=Sing 14 nsubj 14:nsubj ArgTem=arg1:tem
78
4 de de ADP sps00 _ 7 case 7:case _
89
5 dos dos NUM dn0cp0 Number=Plur|NumForm=Word|NumType=Card 7 nummod 7:nummod _

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,13 @@
2222
*/
2323
public class CoNLLUReaderITest {
2424

25-
public String exampleDocument = "Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un " +
26-
"descuido de fábrica.\nDe allí las rebajas.\n";
2725
public String examplePath = String.format("edu/stanford/nlp/pipeline/es-example.conllu");
2826
public StanfordCoreNLP pipeline;
2927
public Annotation goldDocument;
3028
public Annotation readInDocument;
3129

3230
static final String[] EXPECTED_SENTENCE_TEXT = {
33-
"Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
31+
"Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
3432
"De allí las rebajas."
3533
};
3634
static final String EXPECTED_TEXT = String.join(System.lineSeparator(), EXPECTED_SENTENCE_TEXT) + System.lineSeparator();
@@ -189,7 +187,9 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
189187
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
190188
for (int j = 0; j < tokens.size(); ++j) {
191189
CoreLabel token = tokens.get(j);
192-
if (j == tokens.size() - 1) {
190+
if (i == 0 && j == 1) {
191+
assertEquals(" ", token.after());
192+
} else if (j == tokens.size() - 1) {
193193
assertEquals("\n", token.after());
194194
} else if (j == tokens.size() - 2) {
195195
assertEquals("", token.after());
@@ -199,7 +199,9 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
199199
assertEquals(" ", token.after());
200200
}
201201

202-
if (i == 0 && j == 0) {
202+
if (i == 0 && j == 2) {
203+
assertEquals(" ", token.before());
204+
} else if (i == 0 && j == 0) {
203205
assertEquals("", token.before());
204206
} else if (j == 0) {
205207
assertEquals("\n", token.before());

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,79 @@ public CoNLLUReader(Properties props) throws ClassNotFoundException {
108108
columnCount += extraColumns.size();
109109
}
110110

111+
// TODO: is there a better place for this?
112+
public static String unescapeSpacesAfter(String escaped) {
113+
int idx = 0;
114+
StringBuilder unescaped = new StringBuilder();
115+
while (idx < escaped.length()) {
116+
if (escaped.charAt(idx) != '\\') {
117+
unescaped.append(escaped.charAt(idx));
118+
++idx;
119+
continue;
120+
}
121+
if (idx + 2 <= escaped.length()) {
122+
String piece = escaped.substring(idx, idx + 2);
123+
if (piece.equals("\\s")) {
124+
unescaped.append(' ');
125+
idx += 2;
126+
continue;
127+
} else if (piece.equals("\\t")) {
128+
unescaped.append('\t');
129+
idx += 2;
130+
continue;
131+
} else if (piece.equals("\\r")) {
132+
unescaped.append('\r');
133+
idx += 2;
134+
continue;
135+
} else if (piece.equals("\\n")) {
136+
unescaped.append('\n');
137+
idx += 2;
138+
continue;
139+
} else if (piece.equals("\\p")) {
140+
unescaped.append('|');
141+
idx += 2;
142+
continue;
143+
} else if (piece.equals("\\\\")) {
144+
unescaped.append('\\');
145+
idx += 2;
146+
continue;
147+
}
148+
}
149+
if (idx + 6 <= escaped.length()) {
150+
String piece = escaped.substring(idx, idx + 6);
151+
if (piece.equals("\\u00A0")) {
152+
unescaped.append(' ');
153+
idx += 6;
154+
continue;
155+
}
156+
}
157+
unescaped.append(escaped.charAt(idx));
158+
++idx;
159+
}
160+
return unescaped.toString();
161+
}
162+
163+
public static String miscToSpaceAfter(Map<String, String> miscKeyValues) {
164+
String spaceAfter = miscKeyValues.get("SpaceAfter");
165+
if (spaceAfter != null) {
166+
if (spaceAfter.equals("No") || spaceAfter.equals("no")) {
167+
return "";
168+
} else if (spaceAfter.equals("No~")) {
169+
// a random data bug in UD 2.11 Russian-Taiga
170+
return "";
171+
} else {
172+
return " ";
173+
}
174+
}
175+
176+
String spacesAfter = miscKeyValues.get("SpacesAfter");
177+
if (spacesAfter != null) {
178+
return unescapeSpacesAfter(spacesAfter);
179+
}
180+
181+
return " ";
182+
}
183+
111184
/**
112185
* class to store info for a CoNLL-U document
113186
**/
@@ -345,15 +418,11 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
345418
*
346419
*/
347420
if (!fields.get(CoNLLU_MiscField).equals("_")) {
348-
HashMap<String, String> miscKeyValues = new HashMap<>();
421+
Map<String, String> miscKeyValues = new HashMap<>();
349422
Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
350-
kv -> miscKeyValues.put(kv.split("=")[0], kv.split("=")[1]));
351-
// unless SpaceAfter=No, add a space after this token
352-
if (!miscKeyValues.getOrDefault("SpaceAfter", "Yes").equals("No")) {
353-
cl.setAfter(" ");
354-
} else {
355-
cl.setAfter("");
356-
}
423+
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
424+
String spaceAfter = miscToSpaceAfter(miscKeyValues);
425+
cl.setAfter(spaceAfter);
357426
} else {
358427
cl.setAfter(" ");
359428
}
@@ -372,11 +441,17 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
372441
cl.setIsMWTFirst(true);
373442
}
374443
// handle MISC info
444+
// TODO: only do SpaceAfter/SpacesAfter for the last one
445+
// other MWT words should have after==""
375446
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
376-
for (String miscKV : miscInfo.split("\\|")) {
377-
if (miscKV.startsWith("SpaceAfter")) {
378-
cl.setAfter(miscKV.split("=")[1].equals("No") ? "" : " ");
379-
}
447+
if (miscInfo != null && !miscInfo.equals("_")) {
448+
Map<String, String> miscKeyValues = new HashMap<>();
449+
Arrays.stream(miscInfo.split("\\|")).forEach(
450+
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
451+
String spaceAfter = miscToSpaceAfter(miscKeyValues);
452+
cl.setAfter(spaceAfter);
453+
} else {
454+
cl.setAfter(" ");
380455
}
381456
} else {
382457
cl.setIsMWT(false);

0 commit comments

Comments
 (0)