Skip to content

Commit d48c39e

Browse files
authored
Merge pull request #275 from h1alexbel/257-opennlp
bug(#257): Apache OpenNLP instead of Stanford Core NLP
2 parents 9bb5b9a + fa6a507 commit d48c39e

File tree

3 files changed

+64
-59
lines changed

3 files changed

+64
-59
lines changed

pom.xml

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -108,17 +108,6 @@ SOFTWARE.
108108
<artifactId>jcabi-manifests</artifactId>
109109
<!-- version from the parent pom -->
110110
</dependency>
111-
<dependency>
112-
<groupId>edu.stanford.nlp</groupId>
113-
<artifactId>stanford-corenlp</artifactId>
114-
<version>4.5.8</version>
115-
</dependency>
116-
<dependency>
117-
<groupId>edu.stanford.nlp</groupId>
118-
<artifactId>stanford-corenlp</artifactId>
119-
<version>4.5.8</version>
120-
<classifier>models</classifier>
121-
</dependency>
122111
<dependency>
123112
<groupId>org.yaml</groupId>
124113
<artifactId>snakeyaml</artifactId>
@@ -157,6 +146,16 @@ SOFTWARE.
157146
<artifactId>xnav</artifactId>
158147
<version>0.1.3</version>
159148
</dependency>
149+
<dependency>
150+
<groupId>org.apache.opennlp</groupId>
151+
<artifactId>opennlp-tools</artifactId>
152+
<version>2.1.1</version>
153+
</dependency>
154+
<dependency>
155+
<groupId>com.google.code.findbugs</groupId>
156+
<artifactId>jsr305</artifactId>
157+
<version>3.0.2</version>
158+
</dependency>
160159
<dependency>
161160
<groupId>com.yegor256</groupId>
162161
<artifactId>tojos</artifactId>

src/main/java/org/eolang/lints/misc/LtTestNotVerb.java

Lines changed: 53 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,19 @@
2424
package org.eolang.lints.misc;
2525

2626
import com.jcabi.xml.XML;
27-
import edu.stanford.nlp.ling.CoreAnnotations;
28-
import edu.stanford.nlp.pipeline.CoreDocument;
29-
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
3027
import java.io.IOException;
28+
import java.net.URI;
29+
import java.net.URISyntaxException;
3130
import java.util.Arrays;
3231
import java.util.Collection;
3332
import java.util.LinkedList;
3433
import java.util.Locale;
35-
import java.util.Properties;
3634
import java.util.regex.Pattern;
37-
import java.util.stream.Collectors;
3835
import java.util.stream.Stream;
36+
import opennlp.tools.postag.POSModel;
37+
import opennlp.tools.postag.POSTaggerME;
3938
import org.cactoos.io.ResourceOf;
39+
import org.cactoos.list.ListOf;
4040
import org.cactoos.text.TextOf;
4141
import org.cactoos.text.UncheckedText;
4242
import org.eolang.lints.Defect;
@@ -45,20 +45,15 @@
4545

4646
/**
4747
* Lint that checks that the test object name is a verb in singular form.
48-
* This lint uses <a href="https://stanfordnlp.github.io/CoreNLP/">Stanford CoreNLP model</a>
48+
* This lint uses <a href="https://opennlp.apache.org/">OpenNLP models</a>
4949
* with POS tagging capabilities in order to determine the part of speech and
50-
* tense for test object name. Originally, we used <a href="https://opennlp.apache.org/">OpenNLP</a>
51-
* library to do that, but switched to the Stanford CoreNLP, due to merging all
52-
* verb tags into single `VERB` POS tag, that sacrifices important information
53-
* for us about verb tenses, and appeared in OpenNLP 2.4.0+. You can read more
54-
* about the reason of this <a href="https://github.com/objectionary/lints/issues/129">here</a>
55-
* and <a href="https://github.com/objectionary/lints/pull/126#issuecomment-2531121073">here</a>.
50+
* tense for the test object name.
5651
* @since 0.0.22
57-
* @todo #129:60min Library stanford-corenlp-4.5.7-models.jar takes too much in size.
58-
* Currently, JAR takes ~452mb, which may cause some troubles to the users of
59-
* the lints library. Let's think what we can do about this. We should check is
60-
* it possible to get rid of this dependency and download models from the other
61-
* source.
52+
* @todo #257:60min Configure model download only during the build and place it into the JAR.
53+
* Currently, we download the model file each time the lint is created, which may
54+
* slow down usage of this lint. Instead, let's configure Maven to download the
55+
* model file during the build and place it into the JAR, so the lint can locate
56+
* the file from resources faster.
6257
*/
6358
public final class LtTestNotVerb implements Lint<XML> {
6459

@@ -68,46 +63,50 @@ public final class LtTestNotVerb implements Lint<XML> {
6863
private static final Pattern KEBAB = Pattern.compile("-");
6964

7065
/**
71-
* Properties of NLP pipeline.
66+
* Part-Of-Speech tagger.
7267
*/
73-
private final Properties properties;
68+
private final POSTaggerME model;
7469

7570
/**
7671
* Ctor.
77-
* @param props Pipeline properties
7872
*/
79-
public LtTestNotVerb(final Properties props) {
80-
this.properties = new Properties(props);
73+
public LtTestNotVerb() {
74+
this(LtTestNotVerb.defaultPosModel());
8175
}
8276

8377
/**
8478
* Ctor.
79+
* @param mdl Part-Of-Speech model
8580
*/
86-
public LtTestNotVerb() {
87-
this(LtTestNotVerb.defaults());
81+
public LtTestNotVerb(final POSModel mdl) {
82+
this(new POSTaggerME(mdl));
83+
}
84+
85+
/**
86+
* Ctor.
87+
* @param pos Part-Of-Speech tagger
88+
*/
89+
public LtTestNotVerb(final POSTaggerME pos) {
90+
this.model = pos;
8891
}
8992

9093
@Override
9194
public Collection<Defect> defects(final XML xmir) throws IOException {
9295
final Collection<Defect> defects = new LinkedList<>();
93-
final StanfordCoreNLP pipeline = new StanfordCoreNLP(this.properties);
9496
for (final XML object : xmir.nodes("/program[metas/meta[head='tests']]/objects/o[@name]")) {
9597
final String name = object.xpath("@name").get(0);
96-
final CoreDocument doc = new CoreDocument(
97-
Stream
98-
.concat(
99-
Stream.of("It"),
100-
Arrays.stream(LtTestNotVerb.KEBAB.split(name))
101-
)
102-
.map(s -> s.toLowerCase(Locale.ROOT))
103-
.collect(Collectors.joining(" "))
104-
);
105-
pipeline.annotate(doc);
106-
if (
107-
!"VBZ".equals(
108-
doc.tokens().get(1).get(CoreAnnotations.PartOfSpeechAnnotation.class)
98+
final String first = new ListOf<>(
99+
this.model.tag(
100+
Stream
101+
.concat(
102+
Stream.of("It"),
103+
Arrays.stream(LtTestNotVerb.KEBAB.split(name))
104+
)
105+
.map(s -> s.toLowerCase(Locale.ROOT))
106+
.toArray(String[]::new)
109107
)
110-
) {
108+
).get(1);
109+
if (!"VBZ".equals(first)) {
111110
defects.add(
112111
new Defect.Default(
113112
"unit-test-is-not-verb",
@@ -141,13 +140,20 @@ public String name() {
141140
return "unit-test-is-not-verb";
142141
}
143142

144-
/**
145-
* Prestructor for default properties.
146-
* @return Properties.
147-
*/
148-
private static Properties defaults() {
149-
final Properties props = new Properties();
150-
props.setProperty("annotators", "tokenize,pos");
151-
return props;
143+
private static POSModel defaultPosModel() {
144+
try {
145+
return new POSModel(
146+
new URI("https://opennlp.sourceforge.net/models-1.5/en-pos-perceptron.bin")
147+
.toURL()
148+
);
149+
} catch (final IOException exception) {
150+
throw new IllegalStateException(
151+
"Failed to read from I/O", exception
152+
);
153+
} catch (final URISyntaxException exception) {
154+
throw new IllegalStateException(
155+
"URI syntax is broken", exception
156+
);
157+
}
152158
}
153159
}

src/test/java/org/eolang/lints/misc/LtTestIsNotVerbTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ void catchesBadName() throws Exception {
5151
),
5252
Matchers.allOf(
5353
Matchers.<Defect>iterableWithSize(40),
54-
Matchers.<Defect>everyItem(new DefectMatcher())
54+
Matchers.everyItem(new DefectMatcher())
5555
)
5656
);
5757
}

0 commit comments

Comments
 (0)