Skip to content

Commit 214536c

Browse files
authored
Merge pull request #101 from m-stoeckel/dev-tools
Added AnnotationDropper
2 parents 1b3272e + d80a01f commit 214536c

File tree

2 files changed

+351
-0
lines changed

2 files changed

+351
-0
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
package org.texttechnologylab.DockerUnifiedUIMAInterface.tools;
2+
3+
import java.util.HashSet;
4+
import java.util.List;
5+
import java.util.Set;
6+
import java.util.function.Predicate;
7+
import java.util.stream.Collectors;
8+
9+
import org.apache.uima.UimaContext;
10+
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
11+
import org.apache.uima.cas.Type;
12+
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
13+
import org.apache.uima.fit.descriptor.ConfigurationParameter;
14+
import org.apache.uima.jcas.JCas;
15+
import org.apache.uima.resource.ResourceInitializationException;
16+
17+
/**
18+
* A {@link JCasAnnotator_ImplBase JCasAnnotator} that drops or retains specific
19+
* types from processed CASes.
20+
*
21+
* @author Manuel Stoeckel
22+
* @version 0.2.0
23+
*/
24+
public class AnnotationDropper extends JCasAnnotator_ImplBase {
25+
/**
26+
* The types to drop from the CAS.
27+
* Must be the fully qualified class name of the type.
28+
*
29+
* @apiNote You can use the
30+
* {@link org.apache.uima.jcas.cas.TOP#_TypeName _TypeName} field of
31+
* any {@link org.apache.uima.jcas.tcas.Annotation annotation} to
32+
* access the fully qualified class name for convenience.
33+
* @apiNote Only one of {@link #PARAM_TYPES_TO_DROP} or
34+
* {@link #PARAM_TYPES_TO_RETAIN} can be set.
35+
*/
36+
public static final String PARAM_TYPES_TO_DROP = "typesToDrop";
37+
@ConfigurationParameter(name = PARAM_TYPES_TO_DROP, mandatory = false, defaultValue = {})
38+
private String[] paramTypesToDrop;
39+
40+
/**
41+
* The types to drop from the CAS.
42+
* Must be the fully qualified class name of the type.
43+
*
44+
* @apiNote WARNING: Make sure to include integral base types like
45+
* {@link org.apache.uima.jcas.cas.Sofa Sofa}!
46+
* @apiNote You can use the
47+
* {@link org.apache.uima.jcas.cas.TOP#_TypeName _TypeName} field of
48+
* any {@link org.apache.uima.jcas.tcas.Annotation annotation} to
49+
* access the fully qualified class name for convenience.
50+
* @apiNote Only one of {@link #PARAM_TYPES_TO_DROP} or
51+
* {@link #PARAM_TYPES_TO_RETAIN} can be set.
52+
*/
53+
public static final String PARAM_TYPES_TO_RETAIN = "typesToRetain";
54+
@ConfigurationParameter(name = PARAM_TYPES_TO_RETAIN, mandatory = false, defaultValue = {})
55+
private String[] paramTypesToRetain;
56+
57+
enum Mode {
58+
_UNSET,
59+
RETAIN,
60+
DROP
61+
}
62+
63+
private Mode mode = Mode._UNSET;
64+
private HashSet<String> typeSet = new HashSet<>();
65+
66+
/**
67+
* @return The mode of operation.
68+
* Will always be either {@link Mode#RETAIN} or {@link Mode#DROP}.
69+
* @throws IllegalStateException If the mode is unset (i.e. prior to
70+
* {@link #initialize initialization}).
71+
*/
72+
public Mode getMode() {
73+
switch (this.mode) {
74+
case RETAIN:
75+
return Mode.RETAIN;
76+
case DROP:
77+
return Mode.DROP;
78+
case _UNSET:
79+
default:
80+
throw new IllegalStateException("Mode is unset");
81+
}
82+
}
83+
84+
/**
85+
* @return An immutable copy of the {@link #typeSet}.
86+
* @apiNote The returned set can only be empty prior to
87+
* {@link #initialize initialization}.
88+
*/
89+
public Set<String> getTypeSet() {
90+
return Set.copyOf(this.typeSet);
91+
}
92+
93+
/**
94+
* Initializes the annotator.
95+
*
96+
* You can either drop or retain specific types from the CAS.
97+
* The mode of operations is determined automatically based on the
98+
* configuration.
99+
*
100+
* @throws IllegalArgumentException If both parameters
101+
* {@link #PARAM_TYPES_TO_DROP} and
102+
* {@link #PARAM_TYPES_TO_RETAIN} are set.
103+
* @throws IllegalArgumentException If both parameters are empty.
104+
*/
105+
@Override
106+
public void initialize(UimaContext context) throws ResourceInitializationException {
107+
super.initialize(context);
108+
109+
if (this.paramTypesToDrop.length == 0 && this.paramTypesToRetain.length == 0) {
110+
throw new ResourceInitializationException(
111+
new IllegalArgumentException("At least one of PARAM_TYPES_TO_DROP or PARAM_TYPES_TO_RETAIN must be set"));
112+
} else if (this.paramTypesToDrop.length > 0 && this.paramTypesToRetain.length > 0) {
113+
throw new ResourceInitializationException(
114+
new IllegalArgumentException("Only one of PARAM_TYPES_TO_DROP or PARAM_TYPES_TO_RETAIN can be set"));
115+
}
116+
117+
if (this.paramTypesToDrop.length > 0) {
118+
this.mode = Mode.DROP;
119+
this.typeSet = new HashSet<>(List.of(this.paramTypesToDrop));
120+
} else {
121+
this.mode = Mode.RETAIN;
122+
this.typeSet = new HashSet<>(List.of(this.paramTypesToRetain));
123+
}
124+
}
125+
126+
@Override
127+
public void process(JCas aJCas) throws AnalysisEngineProcessException {
128+
switch (this.mode) {
129+
case RETAIN:
130+
retainTypes(aJCas, this.typeSet);
131+
break;
132+
case DROP:
133+
dropTypes(aJCas, this.typeSet);
134+
break;
135+
case _UNSET:
136+
default:
137+
throw new IllegalStateException("Mode is unset");
138+
}
139+
}
140+
141+
static void retainTypes(JCas aJCas, Set<String> typesToRetain) {
142+
Set<String> typesToDrop = aJCas.getAnnotationIndex().iterator()
143+
.stream()
144+
.map(a -> a.getType().getName())
145+
.distinct()
146+
.filter(Predicate.not(typesToRetain::contains))
147+
.collect(Collectors.toSet());
148+
149+
dropTypes(aJCas, typesToDrop);
150+
}
151+
152+
static void dropTypes(JCas aJCas, Iterable<String> typesToDrop) {
153+
for (String typeName : typesToDrop) {
154+
dropType(aJCas, typeName);
155+
}
156+
}
157+
158+
static void dropType(JCas aJCas, String typeName) {
159+
Type type = aJCas.getTypeSystem().getType(typeName);
160+
aJCas.select(type).forEach(a -> a.removeFromIndexes(aJCas));
161+
}
162+
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
package org.texttechnologylab.DockerUnifiedUIMAInterface.tools;
2+
3+
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
4+
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
5+
6+
import java.io.IOException;
7+
import java.net.URISyntaxException;
8+
import java.net.UnknownHostException;
9+
import java.util.Arrays;
10+
import java.util.List;
11+
import java.util.stream.Collectors;
12+
13+
import org.apache.uima.analysis_engine.AnalysisEngine;
14+
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
15+
import org.apache.uima.cas.CASException;
16+
import org.apache.uima.fit.factory.JCasFactory;
17+
import org.apache.uima.fit.util.JCasUtil;
18+
import org.apache.uima.jcas.JCas;
19+
import org.apache.uima.jcas.cas.Sofa;
20+
import org.apache.uima.resource.ResourceInitializationException;
21+
import org.junit.jupiter.api.AfterAll;
22+
import org.junit.jupiter.api.AfterEach;
23+
import org.junit.jupiter.api.Assertions;
24+
import org.junit.jupiter.api.BeforeAll;
25+
import org.junit.jupiter.api.Test;
26+
import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer;
27+
import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver;
28+
import org.xml.sax.SAXException;
29+
30+
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
31+
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
32+
33+
public class TestAnnotationDropper {
34+
static JCas jCas;
35+
static DUUIComposer composer;
36+
37+
static final List<String[]> sentences = Arrays.asList(
38+
new String[] { "This", "is", "a", "sentence", "." },
39+
new String[] { "This", "is", "another", "sentence", "." },
40+
new String[] { "This", "is", "a", "third", "sentence", "." });
41+
42+
@BeforeAll
43+
static void setUp() throws ResourceInitializationException {
44+
try {
45+
jCas = JCasFactory.createJCas();
46+
} catch (ResourceInitializationException | CASException e) {
47+
throw new ResourceInitializationException(e);
48+
}
49+
resetCas();
50+
51+
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size());
52+
Assertions.assertEquals(16, JCasUtil.select(jCas, Token.class).size());
53+
54+
try {
55+
composer = new DUUIComposer()
56+
.withSkipVerification(true)
57+
.withWorkers(1);
58+
} catch (URISyntaxException e) {
59+
throw new ResourceInitializationException(e);
60+
}
61+
62+
DUUIUIMADriver uimaDriver = new DUUIUIMADriver().withDebug(false);
63+
composer.addDriver(uimaDriver);
64+
}
65+
66+
@AfterEach
67+
public void afterEach() throws IOException, SAXException {
68+
composer.resetPipeline();
69+
resetCas();
70+
}
71+
72+
static void resetCas() {
73+
jCas.reset();
74+
jCas.setDocumentText(sentences.stream().flatMap(Arrays::stream).collect(Collectors.joining(" ")));
75+
int tokenOffset = 0;
76+
int sentenceOffset = 0;
77+
for (String[] sentence : sentences) {
78+
String text = String.join(" ", sentence);
79+
jCas.addFsToIndexes(new Sentence(jCas, sentenceOffset, sentenceOffset + text.length()));
80+
sentenceOffset += text.length() + 1;
81+
for (String token : sentence) {
82+
jCas.addFsToIndexes(new Token(jCas, tokenOffset, tokenOffset + token.length()));
83+
tokenOffset += token.length() + 1;
84+
}
85+
}
86+
}
87+
88+
@AfterAll
89+
static void afterAll() throws UnknownHostException {
90+
composer.shutdown();
91+
}
92+
93+
@Test
94+
public void testTypesToRetain() throws ResourceInitializationException, CASException {
95+
try {
96+
AnalysisEngine dropper = createEngine(
97+
AnnotationDropper.class,
98+
AnnotationDropper.PARAM_TYPES_TO_RETAIN,
99+
new String[] {
100+
Sofa._TypeName,
101+
org.apache.uima.jcas.tcas.DocumentAnnotation._TypeName,
102+
org.texttechnologylab.annotation.DocumentAnnotation._TypeName,
103+
Sentence._TypeName,
104+
});
105+
106+
try {
107+
dropper.process(jCas);
108+
} catch (AnalysisEngineProcessException e) {
109+
throw new RuntimeException(e);
110+
}
111+
112+
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size());
113+
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size());
114+
} catch (Exception e) {
115+
throw new RuntimeException(e);
116+
}
117+
}
118+
119+
@Test
120+
public void testTypesToDrop() throws ResourceInitializationException, CASException {
121+
try {
122+
AnalysisEngine dropper = createEngine(
123+
AnnotationDropper.class,
124+
AnnotationDropper.PARAM_TYPES_TO_DROP,
125+
new String[] {
126+
Token._TypeName,
127+
});
128+
129+
try {
130+
dropper.process(jCas);
131+
} catch (AnalysisEngineProcessException e) {
132+
throw new RuntimeException(e);
133+
}
134+
135+
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size());
136+
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size());
137+
} catch (Exception e) {
138+
throw new RuntimeException(e);
139+
}
140+
}
141+
142+
@Test
143+
public void testTypesToRetainDUUI() {
144+
try {
145+
composer.add(new DUUIUIMADriver.Component(createEngineDescription(
146+
AnnotationDropper.class,
147+
AnnotationDropper.PARAM_TYPES_TO_RETAIN,
148+
new String[] {
149+
Sofa._TypeName,
150+
Sentence._TypeName,
151+
})));
152+
153+
try {
154+
composer.run(jCas);
155+
} catch (Exception e) {
156+
Assertions.fail("DUUIComposer failed", e);
157+
}
158+
159+
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size());
160+
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size());
161+
} catch (Exception e) {
162+
throw new RuntimeException(e);
163+
}
164+
}
165+
166+
@Test
167+
public void testTypesToDropDUUI() {
168+
try {
169+
composer.add(new DUUIUIMADriver.Component(createEngineDescription(
170+
AnnotationDropper.class,
171+
AnnotationDropper.PARAM_TYPES_TO_DROP,
172+
new String[] {
173+
Token._TypeName,
174+
})));
175+
176+
try {
177+
composer.run(jCas);
178+
} catch (Exception e) {
179+
Assertions.fail("DUUIComposer failed", e);
180+
}
181+
182+
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size());
183+
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size());
184+
} catch (Exception e) {
185+
throw new RuntimeException(e);
186+
}
187+
}
188+
189+
}

0 commit comments

Comments
 (0)