elsevierlabs-os
diff --git a/‎README.md‎
Lines changed: 65 additions & 1 deletion b/‎README.md‎
Lines changed: 65 additions & 1 deletion
diff --git a/‎pom.xml‎
Lines changed: 27 additions & 0 deletions b/‎pom.xml‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎solr/schema-additions.xml‎
Lines changed: 45 additions & 0 deletions b/‎solr/schema-additions.xml‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎solr/update-plugin.sh‎
Lines changed: 27 additions & 0 deletions b/‎solr/update-plugin.sh‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎solr/update-schema.sh‎
Lines changed: 59 additions & 0 deletions b/‎solr/update-schema.sh‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎src/main/java/com/elsevier/asp/AnalyzerUtils.java‎
Lines changed: 32 additions & 0 deletions b/‎src/main/java/com/elsevier/asp/AnalyzerUtils.java‎
Lines changed: 32 additions & 0 deletions
@@ -1,2 +1,66 @@
 # anserini-solr-plugin
-Solr Plugin that supports Anserini style query expansion and reranking against Solr indexes
+
+Solr Plugin that supports [Anserini](https://github.com/castorini/anserini) style query expansion and reranking against Solr indexes.
+
+### Description
+
+Supports following similarity implementations for paragraph text.
+
+* **Query Likelihood (QL)** -- via the built-in LMDirichlet similarity
+* **BM25** -- via the built-in BM25 similarity (default)
+
+Supports following query rewriting functionality (query A).
+
+* **Bag of Words (BoW)** -- constructs OR query out of individual terms
+* **Sequential Dependency Model (SDM)** -- constructs query out of individual terms, bigrams (ordered and unordered).
+
+Supports the following query reranking functionality (query B). Constructs a more complex query based on the results returned from query A and applies it to the top ${rerankCutoff} results from query A.
+
+* **Relevance Model 3 (RM3)** -- extracts a feature vector from the original query and feature vectors from the top terms of the top documents returned by query A, and interpolates them to create a new reranking query.
+* **Axiomatic Reranker** -- computes mutual information between the query terms and the terms in the top ${rerankCutoff} documents plus random documents that are not among the top results, and scores the terms accordingly. Uses the top K terms to create a new reranking query.
+* **Identity Reranker** -- a do-nothing reranker, passes the results from query A unchanged. Useful for debugging.
+
+### Building
+
+Steps to build the JAR file from the code and deploy to Solr are as follows:
+
+```bash
+$ mvn clean package
+$ mkdir -p ${SOLR_HOME}/server/solr/lib
+$ cp target/anserini-solr-plugins-1.0-SNAPSHOT.jar ${SOLR_HOME}/server/solr/lib/
+```
+
+### Configuration
+
+The plugin expects the additional field types `text_bm` and `text_ql` to be defined in the managed schema at `${SOLR_HOME}/server/solr/${INDEX_NAME}/conf/managed-schema`. Their definitions can be found in [solr/schema-additions.xml](solr/schema-additions.xml). They are needed to support the QL and BM25 similarities described above.
+
+The plugin requires two fields `para_text_bm` and `para_text_ql` with field types `text_bm` and `text_ql` as defined in the previous step. There are no other specific field requirements. An example schema can be found in [solr/update-schema.sh](solr/update-schema.sh).
+
+Please restart Solr after these steps so its class loader can pick up the new JAR file you provided it in the Building section.
+
+The plugin is defined (in `${SOLR_HOME}/server/solr/${INDEX_NAME}/conf/solrconfig.xml`) as detailed in [solr/update-plugin.sh](solr/update-plugin.sh).
+
+### Running
+
+Plugin can be run using HTTP GET requests. A typical URL would be something like the following.
+
+```
+http://localhost:8983/solr/my_index_name/anserini?q=what+are+nails+made+of
+```
+
+Main parameters to tweak behavior are listed below.
+
+* q -- question, URL encoded. Mandatory parameter.
+* sim -- ql (Query Likelihood) or bm (BM25), default bm.
+* qtype -- Query Expansion type. Valid values are bow (Bag of Words) or sdm (Sequential Dependency Model), default is bow.
+* rtype -- Reranking type. Valid values are ax (Axiomatic), rm3 (Relevance Model 3), and id (Identity), default is rm3.
+* start and rows -- for pagination, defaults to 0 and 10 respectively.
+
+For certain qtype and rtype, there are some additional parameters that are listed in [solr/update-plugin.sh](solr/update-plugin.sh) with prefixes "sdm.", "ax.", and "rm3."
+
+### Dependencies
+
+Currently the only dependency is Solr, since we have copy-pasted the relevant parts of Anserini's functionality in the interest of time. The plan is to make Anserini a dependency and leverage its functionality directly.
+
+* Solr 8.1.1
+
@@ -0,0 +1,27 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>com.elsevier</groupId>
+  <artifactId>anserini-solr-plugins</artifactId>
+  <packaging>jar</packaging>
+  <version>1.0-SNAPSHOT</version>
+  <name>anserini-solr-plugins</name>
+  <url>http://maven.apache.org</url>
+
+  <properties>
+    <!-- Pin the Java language level and source encoding explicitly. Without these the
+         build falls back to whatever the installed maven-compiler-plugin defaults to,
+         which differs between plugin versions and is rejected by newer JDKs.
+         Solr 8.x runs on Java 8 or later, so Java 8 is the level targeted here. -->
+    <maven.compiler.source>1.8</maven.compiler.source>
+    <maven.compiler.target>1.8</maven.compiler.target>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <!-- Solr (and, transitively, Lucene) API the plugin is compiled against. -->
+    <!-- https://mvnrepository.com/artifact/org.apache.solr/solr-core -->
+    <dependency>
+      <groupId>org.apache.solr</groupId>
+      <artifactId>solr-core</artifactId>
+      <version>8.1.1</version>
+    </dependency>
+    <!-- Unit testing only; not needed at runtime. -->
+    <!-- https://mvnrepository.com/artifact/junit/junit -->
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.12</version>
+      <scope>test</scope>
+    </dependency>
+
+  </dependencies>
+</project>
@@ -0,0 +1,45 @@
+<!-- The following XML blocks must be copy-pasted inside the schema element of managed-schema.
+     Their ordering does not matter: managed-schema is regenerated by Solr and everything
+     will be rearranged anyway.
+     This needs to be done before setting up the fields.
+-->
+
+  <!-- Global similarity: lets each field type declare its own similarity. Field types
+       that declare none fall back to the similarity configured on text_bm. -->
+  <similarity class="solr.SchemaSimilarityFactory">
+    <str name="defaultSimFromFieldType">text_bm</str>
+  </similarity>
+
+  <!-- Text field type scored with BM25 (b=0.75, k1=1.2).
+       Index and query analysis both tokenize with the standard tokenizer, remove the
+       stop words listed in stopwords.txt and lowercase; the query analyzer additionally
+       expands synonyms from synonyms.txt. -->
+  <fieldType name="text_bm" class="solr.TextField" positionIncrementGap="100" multiValued="true">
+    <analyzer type="index">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
+      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+    <similarity class="solr.BM25SimilarityFactory">
+      <str name="b">0.75</str>
+      <str name="k1">1.2</str>
+    </similarity>
+  </fieldType>
+
+  <!-- Text field type scored with the Dirichlet-smoothed language model (mu=2000), used
+       for Query Likelihood. The analysis chain is the same as the one declared for
+       text_bm; only the similarity differs. -->
+  <fieldType name="text_ql" class="solr.TextField" positionIncrementGap="100" multiValued="true">
+    <analyzer type="index">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.StandardTokenizerFactory"/>
+      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
+      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+    </analyzer>
+    <similarity class="solr.LMDirichletSimilarityFactory">
+      <str name="mu">2000</str>
+    </similarity>
+  </fieldType>
+
@@ -0,0 +1,27 @@
+#!/bin/bash
+curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/qaindex/config -d '{
+  "add-requesthandler": {
+    "name": "/anserini",
+    "class": "com.elsevier.asp.AnseriniRequestHandler",
+    "defaults": {
+        "sim"                       : "bm",
+        "qtype"                     : "bow",
+        "rtype"                     : "rm3",
+        "rerankCutoff"              : "50",
+        "sdm.termWeight"            : "0.85",
+        "sdm.orderedWindowWeight"   : "0.1",
+        "sdm.unorderedWindowWeight" : "0.05",
+        "rm3.fbTerms"               : "10",
+        "rm3.fbDocs"                : "10",
+        "rm3.originalQueryWeight"   : "0.5",
+        "ax.R"                      : "20",
+        "ax.N"                      : "20",
+        "ax.K"                      : "1000",
+        "ax.M"                      : "30",
+        "ax.beta"                   : "0.4",
+        "start"                     : "0",
+        "rows"                      : "10",
+        "fl"                        : "pii,isbns_f,book_title,chapter_title,para_id,para_text"
+    }
+  }
+}'
@@ -0,0 +1,59 @@
+#!/bin/bash
+curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/qaindex/schema -d '{
+  "add-field": {
+    "name": "pii",
+    "type": "string",
+    "stored": true,
+    "indexed": true
+  },
+  "add-field": {
+    "name": "isbns_f",
+    "type": "string",
+    "stored": true,
+    "indexed": false,
+    "multiValued": true
+  },
+  "add-field": {
+    "name": "isbns_u",
+    "type": "string",
+    "stored": true,
+    "indexed": true,
+    "multiValued": true
+  },
+  "add-field": {
+    "name": "book_title",
+    "type": "text_general",
+    "stored": true,
+    "indexed": true
+  },
+  "add-field": {
+    "name": "chapter_title",
+    "type": "text_general",
+    "stored": true,
+    "indexed": true
+  },
+  "add-field": {
+    "name": "para_id",
+    "type": "string",
+    "stored": true,
+    "indexed": true
+  },
+  "add-field": {
+    "name": "para_text_bm",
+    "type": "text_bm",
+    "stored": true,
+    "indexed": true,
+    "termVectors": true,
+    "termPositions": true,
+    "termOffsets": true
+  },
+  "add-field": {
+    "name": "para_text_ql",
+    "type": "text_ql",
+    "stored": true,
+    "indexed": true,
+    "termVectors": true,
+    "termPositions": true,
+    "termOffsets": true
+  }
+}'
@@ -0,0 +1,32 @@
+package com.elsevier.asp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Static helpers for running text through a Lucene {@link Analyzer}.
+ */
+public class AnalyzerUtils {
+
+	/**
+	 * Analyzes {@code queryString} with {@code analyzer} as if it were content of
+	 * {@code fieldName} and returns the resulting terms in stream order.
+	 *
+	 * <p>Empty terms are skipped. If the analyzer raises an {@link IOException}, the
+	 * stack trace is printed and the terms collected up to that point are returned;
+	 * the exception is not propagated.
+	 *
+	 * @param queryString text to analyze
+	 * @param fieldName   field name handed to the analyzer
+	 * @param analyzer    analyzer that produces the token stream
+	 * @return the non-empty terms emitted by the analyzer, never {@code null}
+	 */
+	public static List<String> tokenizeQuery(String queryString, String fieldName, Analyzer analyzer) {
+		List<String> queryTokens = new ArrayList<String>();
+		TokenStream tokenStream = null;
+		try {
+			tokenStream = analyzer.tokenStream(fieldName, queryString);
+			// addAttribute returns the attribute already present on the stream or registers
+			// one, whereas getAttribute throws when the attribute is missing.
+			CharTermAttribute termAttr = tokenStream.addAttribute(CharTermAttribute.class);
+			tokenStream.reset();
+			while (tokenStream.incrementToken()) {
+				String token = termAttr.toString();
+				if (token.length() == 0) {
+					continue;
+				}
+				queryTokens.add(token);
+			}
+			tokenStream.end();
+		} catch (IOException e) {
+			// Best effort: report the failure and return what was collected so far.
+			e.printStackTrace();
+		} finally {
+			// Always close the stream, including on failure: analyzers reuse their token
+			// stream components, and a stream left open breaks the next tokenStream() call.
+			if (tokenStream != null) {
+				try {
+					tokenStream.close();
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+		return queryTokens;
+	}
+
+}