Merge pull request #275 from h1alexbel/257-opennlp

yegor256 · web-flow · commit d48c39e1b588 · 2025-01-23T16:46:49.000+03:00
bug(#257): Apache OpenNLP instead of Stanford Core NLP
diff --git a/pom.xml b/pom.xml
@@ -108,17 +108,6 @@ SOFTWARE.
       <artifactId>jcabi-manifests</artifactId>
       <!-- version from the parent pom -->
     </dependency>
-    <dependency>
-      <groupId>edu.stanford.nlp</groupId>
-      <artifactId>stanford-corenlp</artifactId>
-      <version>4.5.8</version>
-    </dependency>
-    <dependency>
-      <groupId>edu.stanford.nlp</groupId>
-      <artifactId>stanford-corenlp</artifactId>
-      <version>4.5.8</version>
-      <classifier>models</classifier>
-    </dependency>
     <dependency>
       <groupId>org.yaml</groupId>
       <artifactId>snakeyaml</artifactId>
@@ -157,6 +146,16 @@ SOFTWARE.
       <artifactId>xnav</artifactId>
       <version>0.1.3</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.opennlp</groupId>
+      <artifactId>opennlp-tools</artifactId>
+      <version>2.1.1</version>
+    </dependency>
+    <dependency>
+      <groupId>com.google.code.findbugs</groupId>
+      <artifactId>jsr305</artifactId>
+      <version>3.0.2</version>
+    </dependency>
     <dependency>
       <groupId>com.yegor256</groupId>
       <artifactId>tojos</artifactId>
diff --git a/src/main/java/org/eolang/lints/misc/LtTestNotVerb.java b/src/main/java/org/eolang/lints/misc/LtTestNotVerb.java
@@ -24,19 +24,19 @@
 package org.eolang.lints.misc;
 
 import com.jcabi.xml.XML;
-import edu.stanford.nlp.ling.CoreAnnotations;
-import edu.stanford.nlp.pipeline.CoreDocument;
-import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.LinkedList;
 import java.util.Locale;
-import java.util.Properties;
 import java.util.regex.Pattern;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTaggerME;
 import org.cactoos.io.ResourceOf;
+import org.cactoos.list.ListOf;
 import org.cactoos.text.TextOf;
 import org.cactoos.text.UncheckedText;
 import org.eolang.lints.Defect;
@@ -45,20 +45,15 @@
 
 /**
  * Lint that checks test object name is a verb in singular.
- * This lint uses <a href="https://stanfordnlp.github.io/CoreNLP/">Stanford CoreNLP model</a>
+ * This lint uses <a href="https://opennlp.apache.org/">OpenNLP models</a>
  * with POS tagging capabilities in order to determine the part of speech and
- * tense for test object name. Originally, we used <a href="https://opennlp.apache.org/">OpenNLP</a>
- * library to do that, but switched to the Stanford CoreNLP, due to merging all
- * verb tags into single `VERB` POS tag, that sacrifices important information
- * for us about verb tenses, and appeared in OpenNLP 2.4.0+. You can read more
- * about the reason of this <a href="https://github.com/objectionary/lints/issues/129">here</a>
- * and <a href="https://github.com/objectionary/lints/pull/126#issuecomment-2531121073">here</a>.
+ * tense for test object name.
  * @since 0.0.22
- * @todo #129:60min Library stanford-corenlp-4.5.7-models.jar takes too much in size.
- *  Currently, JAR takes ~452mb, which may cause some troubles to the users of
- *  the lints library. Let's think what we can do about this. We should check is
- *  it possible to get rid of this dependency and download models from the other
- *  source.
+ * @todo #257:60min Configure model download only during the build and place into the JAR.
+ *  Currently, we download the model file every time the lint is created, which
+ *  may slow down the usage of this lint. Instead, let's configure Maven to download
+ *  the model file during the build and place it into the JAR, so the lint will be
+ *  able to load the file from resources faster.
  */
 public final class LtTestNotVerb implements Lint<XML> {
 
@@ -68,46 +63,50 @@ public final class LtTestNotVerb implements Lint<XML> {
     private static final Pattern KEBAB = Pattern.compile("-");
 
     /**
-     * Properties of NLP pipeline.
+     * Part-Of-Speech tagger.
      */
-    private final Properties properties;
+    private final POSTaggerME model;
 
     /**
      * Ctor.
-     * @param props Pipeline properties
      */
-    public LtTestNotVerb(final Properties props) {
-        this.properties = new Properties(props);
+    public LtTestNotVerb() {
+        this(LtTestNotVerb.defaultPosModel());
     }
 
     /**
      * Ctor.
+     * @param mdl Part-Of-Speech model
      */
-    public LtTestNotVerb() {
-        this(LtTestNotVerb.defaults());
+    public LtTestNotVerb(final POSModel mdl) {
+        this(new POSTaggerME(mdl));
+    }
+
+    /**
+     * Ctor.
+     * @param pos Part-Of-Speech tagger
+     */
+    public LtTestNotVerb(final POSTaggerME pos) {
+        this.model = pos;
     }
 
     @Override
     public Collection<Defect> defects(final XML xmir) throws IOException {
         final Collection<Defect> defects = new LinkedList<>();
-        final StanfordCoreNLP pipeline = new StanfordCoreNLP(this.properties);
         for (final XML object : xmir.nodes("/program[metas/meta[head='tests']]/objects/o[@name]")) {
             final String name = object.xpath("@name").get(0);
-            final CoreDocument doc = new CoreDocument(
-                Stream
-                    .concat(
-                        Stream.of("It"),
-                        Arrays.stream(LtTestNotVerb.KEBAB.split(name))
-                    )
-                    .map(s -> s.toLowerCase(Locale.ROOT))
-                    .collect(Collectors.joining(" "))
-            );
-            pipeline.annotate(doc);
-            if (
-                !"VBZ".equals(
-                    doc.tokens().get(1).get(CoreAnnotations.PartOfSpeechAnnotation.class)
+            final String first = new ListOf<>(
+                this.model.tag(
+                    Stream
+                        .concat(
+                            Stream.of("It"),
+                            Arrays.stream(LtTestNotVerb.KEBAB.split(name))
+                        )
+                        .map(s -> s.toLowerCase(Locale.ROOT))
+                        .toArray(String[]::new)
                 )
-            ) {
+            ).get(1);
+            if (!"VBZ".equals(first)) {
                 defects.add(
                     new Defect.Default(
                         "unit-test-is-not-verb",
@@ -141,13 +140,20 @@ public String name() {
         return "unit-test-is-not-verb";
     }
 
-    /**
-     * Prestructor for default properties.
-     * @return Properties.
-     */
-    private static Properties defaults() {
-        final Properties props = new Properties();
-        props.setProperty("annotators", "tokenize,pos");
-        return props;
+    private static POSModel defaultPosModel() {
+        try {
+            return new POSModel(
+                new URI("https://opennlp.sourceforge.net/models-1.5/en-pos-perceptron.bin")
+                    .toURL()
+            );
+        } catch (final IOException exception) {
+            throw new IllegalStateException(
+                "Failed to read from I/O", exception
+            );
+        } catch (final URISyntaxException exception) {
+            throw new IllegalStateException(
+                "URI syntax is broken", exception
+            );
+        }
     }
 }
diff --git a/src/test/java/org/eolang/lints/misc/LtTestIsNotVerbTest.java b/src/test/java/org/eolang/lints/misc/LtTestIsNotVerbTest.java
@@ -51,7 +51,7 @@ void catchesBadName() throws Exception {
             ),
             Matchers.allOf(
                 Matchers.<Defect>iterableWithSize(40),
-                Matchers.<Defect>everyItem(new DefectMatcher())
+                Matchers.everyItem(new DefectMatcher())
             )
         );
     }

Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,7 @@ void catchesBadName() throws Exception {`
`51`	`51`	`),`
`52`	`52`	`Matchers.allOf(`
`53`	`53`	`Matchers.<Defect>iterableWithSize(40),`
`54`		`- Matchers.<Defect>everyItem(new DefectMatcher())`
	`54`	`+ Matchers.everyItem(new DefectMatcher())`
`55`	`55`	`)`
`56`	`56`	`);`
`57`	`57`	`}`