Skip to content

Commit f679a5e

Browse files
authored
OPENNLP-976: Implement GermEval2014 Format (#971)
1 parent 65b800b commit f679a5e

File tree

7 files changed

+1047
-0
lines changed

7 files changed

+1047
-0
lines changed

opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
3131
import opennlp.tools.formats.DocumentSampleStreamFactory;
3232
import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
33+
import opennlp.tools.formats.GermEval2014NameSampleStreamFactory;
3334
import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
3435
import opennlp.tools.formats.LemmatizerSampleStreamFactory;
3536
import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -107,6 +108,7 @@ public final class StreamFactoryRegistry {
107108
Conll02NameSampleStreamFactory.registerFactory();
108109
Conll03NameSampleStreamFactory.registerFactory();
109110
EvalitaNameSampleStreamFactory.registerFactory();
111+
GermEval2014NameSampleStreamFactory.registerFactory();
110112
ConllXPOSSampleStreamFactory.registerFactory();
111113
ConllXSentenceSampleStreamFactory.registerFactory();
112114
ConllXTokenSampleStreamFactory.registerFactory();
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.formats;
19+
20+
import java.io.IOException;
21+
import java.nio.charset.StandardCharsets;
22+
import java.util.ArrayList;
23+
import java.util.List;
24+
25+
import opennlp.tools.commons.Internal;
26+
import opennlp.tools.namefind.NameSample;
27+
import opennlp.tools.util.InputStreamFactory;
28+
import opennlp.tools.util.InvalidFormatException;
29+
import opennlp.tools.util.ObjectStream;
30+
import opennlp.tools.util.PlainTextByLineStream;
31+
import opennlp.tools.util.Span;
32+
import opennlp.tools.util.StringUtil;
33+
34+
/**
35+
* Parser for the GermEval 2014 Named Entity Recognition Shared Task data.
36+
* <p>
37+
* The data is in a tab-separated format with four columns:
38+
* <ol>
39+
* <li>Token index (1-based per sentence)</li>
40+
* <li>Token text</li>
41+
* <li>Outer named entity tag (IOB2 scheme)</li>
42+
* <li>Nested/embedded named entity tag (IOB2 scheme)</li>
43+
* </ol>
44+
* Comment lines starting with {@code #} mark document boundaries and contain
45+
* source URL and date metadata. Blank lines separate sentences.
46+
* <p>
47+
* The data uses four main entity types: Person (PER), Location (LOC),
48+
* Organization (ORG) and Other (OTH), with additional {@code deriv} and
49+
* {@code part} suffixes for derived forms and name parts respectively.
50+
* <p>
51+
* Since {@link NameSample} does not support overlapping spans, this stream
52+
* requires selecting either the {@link NerLayer#OUTER outer} or
53+
* {@link NerLayer#INNER inner} annotation layer via a {@link NerLayer} parameter.
54+
* <p>
55+
* Data can be found on
56+
* <a href="https://sites.google.com/site/germeval2014ner/data">this web site</a>.
57+
* <p>
58+
* <b>Note:</b>
59+
* Do not use this class, internal use only!
60+
*/
61+
@Internal
62+
public class GermEval2014NameSampleStream implements ObjectStream<NameSample> {
63+
64+
/**
65+
* Selects which NER annotation layer to read from the GermEval 2014 data.
66+
*/
67+
public enum NerLayer {
68+
/** The outer (top-level) named entity annotations (column 3). */
69+
OUTER,
70+
/** The nested/embedded named entity annotations (column 4). */
71+
INNER
72+
}
73+
74+
public static final int GENERATE_PERSON_ENTITIES = 0x01;
75+
public static final int GENERATE_ORGANIZATION_ENTITIES = 0x01 << 1;
76+
public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2;
77+
public static final int GENERATE_MISC_ENTITIES = 0x01 << 3;
78+
79+
private final ObjectStream<String> lineStream;
80+
private final int types;
81+
private final NerLayer layer;
82+
83+
/**
84+
* Initializes a {@link GermEval2014NameSampleStream}.
85+
*
86+
* @param lineStream An {@link ObjectStream} over the lines
87+
* in the GermEval 2014 data file.
88+
* @param types The entity types to include in the Name Sample object stream.
89+
* @param layer The {@link NerLayer} to read.
90+
*/
91+
public GermEval2014NameSampleStream(final ObjectStream<String> lineStream,
92+
final int types, final NerLayer layer) {
93+
this.lineStream = lineStream;
94+
this.types = types;
95+
this.layer = layer;
96+
}
97+
98+
/**
99+
* Initializes a {@link GermEval2014NameSampleStream}.
100+
*
101+
* @param in The {@link InputStreamFactory} for the input file.
102+
* @param types The entity types to include in the Name Sample object stream.
103+
* @param layer The {@link NerLayer} to read.
104+
* @throws IOException Thrown if IO errors occurred.
105+
*/
106+
public GermEval2014NameSampleStream(final InputStreamFactory in, final int types,
107+
final NerLayer layer) throws IOException {
108+
this(new PlainTextByLineStream(in, StandardCharsets.UTF_8), types, layer);
109+
}
110+
111+
static Span extract(final int begin, final int end, final String beginTag)
112+
throws InvalidFormatException {
113+
114+
final String type = mapTagToType(beginTag);
115+
return new Span(begin, end, type);
116+
}
117+
118+
private static String mapTagToType(final String tag) throws InvalidFormatException {
119+
// Strip B- or I- prefix
120+
final String rawType = tag.substring(2);
121+
122+
return switch (rawType) {
123+
case "PER" -> "person";
124+
case "PERderiv" -> "personderiv";
125+
case "PERpart" -> "personpart";
126+
case "LOC" -> "location";
127+
case "LOCderiv" -> "locationderiv";
128+
case "LOCpart" -> "locationpart";
129+
case "ORG" -> "organization";
130+
case "ORGderiv" -> "organizationderiv";
131+
case "ORGpart" -> "organizationpart";
132+
case "OTH" -> "misc";
133+
case "OTHderiv" -> "miscderiv";
134+
case "OTHpart" -> "miscpart";
135+
default -> throw new InvalidFormatException("Unknown type: " + rawType);
136+
};
137+
}
138+
139+
private boolean isTypeEnabled(final String tag) {
140+
if (tag.startsWith("B-PER") || tag.startsWith("I-PER")) {
141+
return (types & GENERATE_PERSON_ENTITIES) != 0;
142+
}
143+
if (tag.startsWith("B-ORG") || tag.startsWith("I-ORG")) {
144+
return (types & GENERATE_ORGANIZATION_ENTITIES) != 0;
145+
}
146+
if (tag.startsWith("B-LOC") || tag.startsWith("I-LOC")) {
147+
return (types & GENERATE_LOCATION_ENTITIES) != 0;
148+
}
149+
if (tag.startsWith("B-OTH") || tag.startsWith("I-OTH")) {
150+
return (types & GENERATE_MISC_ENTITIES) != 0;
151+
}
152+
return tag.equals("O");
153+
}
154+
155+
private List<Span> convertTagsToSpans(final List<String> tags) throws IOException {
156+
final List<Span> names = new ArrayList<>();
157+
158+
int beginIndex = -1;
159+
int endIndex = -1;
160+
161+
for (int i = 0; i < tags.size(); i++) {
162+
String tag = tags.get(i);
163+
164+
if (!tag.equals("O") && !isTypeEnabled(tag)) {
165+
tag = "O";
166+
}
167+
168+
if (tag.startsWith("B-")) {
169+
if (beginIndex != -1) {
170+
names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
171+
}
172+
beginIndex = i;
173+
endIndex = i + 1;
174+
} else if (tag.startsWith("I-")) {
175+
endIndex++;
176+
} else if (tag.equals("O")) {
177+
if (beginIndex != -1) {
178+
names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
179+
beginIndex = -1;
180+
endIndex = -1;
181+
}
182+
} else {
183+
throw new IOException("Invalid tag: " + tag);
184+
}
185+
}
186+
187+
// if one span remains, create it here
188+
if (beginIndex != -1) {
189+
names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
190+
}
191+
192+
return names;
193+
}
194+
195+
@Override
196+
public NameSample read() throws IOException {
197+
198+
final List<String> sentence = new ArrayList<>();
199+
final List<String> outerTags = new ArrayList<>();
200+
final List<String> innerTags = new ArrayList<>();
201+
202+
boolean isClearAdaptiveData = false;
203+
204+
// Empty line indicates end of sentence
205+
String line;
206+
while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
207+
208+
// Comment lines starting with # mark document boundaries
209+
if (line.startsWith("#")) {
210+
isClearAdaptiveData = true;
211+
continue;
212+
}
213+
214+
final String[] fields = line.split("\t");
215+
216+
if (fields.length >= 4) {
217+
sentence.add(fields[1]);
218+
outerTags.add(fields[2]);
219+
innerTags.add(fields[3].trim());
220+
} else {
221+
throw new IOException("Expected at least four tab-separated fields per line "
222+
+ "in GermEval 2014 data, got " + fields.length + " for line '" + line + "'!");
223+
}
224+
}
225+
226+
if (sentence.size() > 0) {
227+
final List<String> selectedTags = (layer == NerLayer.OUTER) ? outerTags : innerTags;
228+
final List<Span> names = convertTagsToSpans(selectedTags);
229+
230+
return new NameSample(sentence.toArray(new String[0]),
231+
names.toArray(new Span[0]), isClearAdaptiveData);
232+
} else if (line != null) {
233+
// Just filter out empty events, if two lines in a row are empty
234+
return read();
235+
} else {
236+
// source stream is not returning anymore lines
237+
return null;
238+
}
239+
}
240+
241+
@Override
242+
public void reset() throws IOException, UnsupportedOperationException {
243+
lineStream.reset();
244+
}
245+
246+
@Override
247+
public void close() throws IOException {
248+
lineStream.close();
249+
}
250+
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.formats;
19+
20+
import java.io.IOException;
21+
22+
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
23+
import opennlp.tools.cmdline.StreamFactoryRegistry;
24+
import opennlp.tools.cmdline.TerminateToolException;
25+
import opennlp.tools.cmdline.params.BasicFormatParams;
26+
import opennlp.tools.commons.Internal;
27+
import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer;
28+
import opennlp.tools.namefind.NameSample;
29+
import opennlp.tools.util.ObjectStream;
30+
31+
/**
32+
* <b>Note:</b>
33+
* Do not use this class, internal use only!
34+
*
35+
* @see GermEval2014NameSampleStream
36+
*/
37+
@Internal
38+
public class GermEval2014NameSampleStreamFactory extends
39+
LanguageSampleStreamFactory<NameSample, GermEval2014NameSampleStreamFactory.Parameters> {
40+
41+
public interface Parameters extends BasicFormatParams {
42+
@ParameterDescription(valueName = "per,loc,org,misc")
43+
String getTypes();
44+
45+
@ParameterDescription(valueName = "outer|inner", description = "NER annotation layer to use. " +
46+
"Use 'outer' for top-level entities or 'inner' for nested/embedded entities.")
47+
String getLayer();
48+
}
49+
50+
public static void registerFactory() {
51+
StreamFactoryRegistry.registerFactory(NameSample.class,
52+
"germeval2014", new GermEval2014NameSampleStreamFactory(Parameters.class));
53+
}
54+
55+
protected GermEval2014NameSampleStreamFactory(final Class<Parameters> params) {
56+
super(params);
57+
}
58+
59+
@Override
60+
public ObjectStream<NameSample> create(final String[] args) {
61+
62+
final Parameters params = validateBasicFormatParameters(args, Parameters.class);
63+
64+
language = "deu";
65+
66+
int typesToGenerate = 0;
67+
68+
if (params.getTypes().contains("per")) {
69+
typesToGenerate = typesToGenerate |
70+
GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES;
71+
}
72+
if (params.getTypes().contains("org")) {
73+
typesToGenerate = typesToGenerate |
74+
GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES;
75+
}
76+
if (params.getTypes().contains("loc")) {
77+
typesToGenerate = typesToGenerate |
78+
GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES;
79+
}
80+
if (params.getTypes().contains("misc")) {
81+
typesToGenerate = typesToGenerate |
82+
GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES;
83+
}
84+
85+
final NerLayer layer;
86+
final String layerParam = params.getLayer();
87+
if (layerParam == null || "outer".equals(layerParam)) {
88+
layer = NerLayer.OUTER;
89+
} else if ("inner".equals(layerParam)) {
90+
layer = NerLayer.INNER;
91+
} else {
92+
throw new TerminateToolException(1, "Unsupported layer: " + layerParam
93+
+ ". Use 'outer' or 'inner'.");
94+
}
95+
96+
try {
97+
return new GermEval2014NameSampleStream(
98+
FormatUtil.createInputStreamFactory(params.getData()), typesToGenerate, layer);
99+
} catch (final IOException e) {
100+
throw new TerminateToolException(-1,
101+
"IO Error while creating an Input Stream: " + e.getMessage(), e);
102+
}
103+
}
104+
}

0 commit comments

Comments
 (0)