Skip to content

Commit 0d1e056

Browse files
authored
Merge pull request #1414 from dkpro/bugfix/1413-CoNLL-U-export-broken-if-text-contains-line-breaks
#1413 - CoNLL-U export broken if text contains line breaks
2 parents 42498c5 + 1c99ee2 commit 0d1e056

File tree

3 files changed

+77
-1
lines changed

3 files changed

+77
-1
lines changed

dkpro-core-io-conll-asl/src/main/java/org/dkpro/core/io/conll/ConllUWriter.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,11 @@ private void convert(JCas aJCas, PrintWriter aOut)
169169
aOut.printf("# %s = %s\n", ConllUReader.META_SEND_ID, sentence.getId());
170170
}
171171
if (writeTextHeader) {
172-
aOut.printf("# %s = %s\n", ConllUReader.META_TEXT, sentence.getCoveredText());
172+
String sentenceText = sentence.getCoveredText();
173+
// CoNLL-U does not support line breaks in the sentence text, so we need to replace
174+
// such characters ("\n" becomes " "; "\r" is dropped if this is Commons Lang replaceChars).
175+
sentenceText = StringUtils.replaceChars(sentenceText, "\n\r", " ");
176+
aOut.printf("# %s = %s\n", ConllUReader.META_TEXT, sentenceText);
173177
}
174178

175179
// Tokens
dkpro-core-io-conll-asl/src/test/java/org/dkpro/core/io/conll/ConllUWriterTest.java

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright 2019
3+
* Ubiquitous Knowledge Processing (UKP) Lab
4+
* Technische Universität Darmstadt
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.dkpro.core.io.conll;
19+
20+
import static org.apache.commons.io.FileUtils.readFileToString;
21+
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
22+
import static org.assertj.core.api.Assertions.assertThat;
23+
24+
import java.io.File;
25+
26+
import org.apache.uima.analysis_engine.AnalysisEngine;
27+
import org.apache.uima.fit.factory.JCasFactory;
28+
import org.apache.uima.jcas.JCas;
29+
import org.dkpro.core.testing.DkproTestContext;
30+
import org.junit.Rule;
31+
import org.junit.Test;
32+
33+
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
34+
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
35+
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
36+
37+
public class ConllUWriterTest
38+
{
39+
@Test
40+
public void thatLineBreaksDoNotBreakTheFormat() throws Exception
41+
{
42+
File target = testContext.getTestOutputFolder();
43+
44+
JCas jcas = JCasFactory.createText("Test\ntest.");
45+
new Sentence(jcas, 0, 10).addToIndexes();
46+
new Token(jcas, 0, 4).addToIndexes();
47+
new Token(jcas, 5, 9).addToIndexes();
48+
new Token(jcas, 9, 10).addToIndexes();
49+
50+
DocumentMetaData dmd = DocumentMetaData.create(jcas);
51+
dmd.setDocumentId("output");
52+
53+
AnalysisEngine writer = createEngine(ConllUWriter.class,
54+
ConllUWriter.PARAM_TARGET_LOCATION, target);
55+
56+
writer.process(jcas);
57+
58+
String reference = readFileToString(
59+
new File("src/test/resources/conll/u_v2/conllu-linebreaks.conll"), "UTF-8").trim();
60+
String actual = readFileToString(new File(target, "output.conll"), "UTF-8").trim();
61+
62+
assertThat(actual).isEqualToNormalizingNewlines(reference);
63+
}
64+
65+
@Rule
66+
public DkproTestContext testContext = new DkproTestContext();
67+
}
dkpro-core-io-conll-asl/src/test/resources/conll/u_v2/conllu-linebreaks.conll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# text = Test test.
2+
1 Test _ _ _ _ _ _ _ _
3+
2 test _ _ _ _ _ _ _ SpaceAfter=No
4+
3 . _ _ _ _ _ _ _ _
5+

0 commit comments

Comments
 (0)