elsevierlabs-os · navd · Mar 22, 2018 · Mar 22, 2018 · dsmiley · Mar 22, 2018
diff --git a/docs/api.md b/docs/api.md
@@ -103,6 +103,7 @@ Annotates text against a specified lexicon and match type. Match type can be one
 * __punct__ - removes punctuation and phrase chunks input, then matches phrases against lexicon entries. Also case insensitive.
 * __sort__ - same as punct, except that words in phrase chunks are sorted and matched against similarly sorted lexicon entries.
 * __stem__ - same as sort, except words in phrases are stemmed using Porter stemmer and matched against similarly stemmed lexicon entries.
+* __fuzzy__ - matches each word of the text with a fuzzy Solr query; the level of fuzziness is controlled by the number of character edits allowed. The more similar the matched word, the higher the confidence.
 
 __URL__ http://host:port/soda/annot.json
 
@@ -139,6 +140,47 @@ __OUTPUT__
     ]
 ````
 
+__FUZZY SEARCH__
+
+The default fuzziness is 1, but any integer can be passed; it indicates the number of fuzzy characters considered for each word.
+
+__INPUT__
+
+````json
+    {
+        "lexicon":"gender",
+        "text":"Mayer made Fortune magazine history in 2013, Most Powerful women , and 40 Under 40 at the same time",
+        "matching":"fuzzy",
+        "fuzziness":1
+    }
+````
+
+__OUTPUT__
+
+````json
+    [
+        {
+            "matchedText": "male",
+            "begin": 6,
+            "lexicon": "gender",
+            "confidence": "0.75",
+            "id": "man",
+            "coveredText": "made",
+            "end": 9
+        },
+        {
+            "matchedText": "women",
+            "begin": 59,
+            "lexicon": "gender",
+            "confidence": "1.0",
+            "id": "female",
+            "coveredText": "women",
+            "end": 63
+        }
+    ]
+````
+
+
 __EXAMPLE PYTHON CLIENT__
 
 ````python

diff --git a/src/main/scala/com/elsevier/soda/AnnotationHelper.scala b/src/main/scala/com/elsevier/soda/AnnotationHelper.scala
@@ -12,6 +12,7 @@ object AnnotationHelper {
     val CoveredText = "covered"
     val Confidence = "conf"
     val Lexicon = "lexicon"
+    val MatchedText = "matchedText"
 
     def confToStr(conf: Double) = "%.3f".format(conf)
 

diff --git a/src/main/scala/com/elsevier/soda/SodaController.scala b/src/main/scala/com/elsevier/soda/SodaController.scala
@@ -42,11 +42,12 @@ class SodaController @Autowired() (sodaService: SodaService) {
         val lexicon = params.getOrElse("lexicon", "").asInstanceOf[String]
         val text = params.getOrElse("text", "").asInstanceOf[String]
         val matchFlag = params.getOrElse("matching", "exact").asInstanceOf[String]
+        val fuzziness = params.getOrElse("fuzziness", 1.0).asInstanceOf[Double]
         if (lexicon.isEmpty || text.isEmpty) {
             model.addAttribute("response", 
                 SodaUtils.error("Both Lexicon and Text must be specified!"))
         } else {
-            val annots = sodaService.annotate(text, lexicon, matchFlag)
+            val annots = sodaService.annotate(text, lexicon, matchFlag, fuzziness.toInt)
             model.addAttribute("response", sodaService.annotJson(annots))
         }
         "annotate"

diff --git a/src/main/scala/com/elsevier/soda/SodaService.scala b/src/main/scala/com/elsevier/soda/SodaService.scala
@@ -3,7 +3,7 @@ package com.elsevier.soda
 import scala.collection.JavaConversions.asScalaBuffer
 import scala.collection.JavaConversions.asScalaIterator
 import scala.collection.JavaConversions.collectionAsScalaIterable
-import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.{ArrayBuffer, HashMap}
 import org.apache.commons.lang3.StringUtils
 import org.apache.solr.client.solrj.SolrRequest
 import org.apache.solr.client.solrj.impl.HttpSolrClient
@@ -17,9 +17,11 @@ import org.apache.solr.common.util.NamedList
 import org.springframework.stereotype.Service
 import org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException
 import java.util.Collections
+
 import org.apache.solr.common.util.ContentStream
 import org.apache.solr.common.SolrInputDocument
 import java.util.regex.Pattern
+
 import com.aliasi.chunk.RegExChunker
 
 case class DictInfo(dictName: String, numEntries: Long)
@@ -39,7 +41,7 @@ class SodaService {
     val sodaClient = new SodaClient()
 
     def annotate(text: String, lexiconName: String, 
-            matchFlag: String): List[Annotation] = {
+            matchFlag: String, fuzziness: Int = 1): List[Annotation] = {
         val lexName = if (lexiconName.endsWith("-full")) 
                           lexiconName.substring(0, lexiconName.length() - 5)
                       else lexiconName
@@ -49,6 +51,7 @@ class SodaService {
             case "punct" => chunkAndTag(text, lexName, "tagname_nrm")
             case "sort" => chunkAndTag(text, lexName, "tagname_srt")
             case "stem" => chunkAndTag(text, lexName, "tagname_stm")
+            case "fuzzy" => fuzzySearch(text, lexName, fuzziness)
             case _ => List()
         }
         if (lexiconName.endsWith("-full")) annotations
@@ -183,6 +186,42 @@ class SodaService {
             coveredText.trim().length() > 3 && !stopwords.contains(coveredText)    
         }).toList
     }
+
+    /** Annotates every space-separated word of text with its closest fuzzy Solr match in lexName.
+      * fuzzinessValue is the allowed edit distance; Lucene supports 0..2, so it is clamped. */
+    def fuzzySearch(text: String, lexName: String, fuzzinessValue: Int): List[Annotation] = {
+        import org.apache.solr.client.solrj.util.ClientUtils
+        val maxEdits = math.max(0, math.min(2, fuzzinessValue))
+        val words = text.split(" ")
+        // start offset of each word; every split consumed exactly one space
+        val starts = words.scanLeft(0)((offset, word) => offset + word.length + 1)
+        // drop blanks (repeated spaces) and stopwords before spending a Solr round trip
+        words.zip(starts).toList
+          .filter({ case (word, _) => word.nonEmpty && !stopwords.contains(word.toLowerCase()) })
+          .flatMap({ case (word, start) =>
+            val params = new ModifiableSolrParams()
+            // escape query syntax characters so punctuation in the text cannot break the query
+            params.add(CommonParams.Q,
+                "tagname_str:" + ClientUtils.escapeQueryChars(word) + "~" + maxEdits)
+            params.add(CommonParams.ROWS, "1")
+            params.add(CommonParams.FQ, buildFq(lexName, false))
+            params.add(CommonParams.FL, "id,tagname_str")
+            val results = querySolr.query(params).getResults()
+            if (results.getNumFound() > 0) {
+                val sdoc = results.get(0)
+                val id = sdoc.getFieldValue("id").asInstanceOf[String]
+                val names = sdoc.getFieldValues("tagname_str")
+                  .map(_.asInstanceOf[String])
+                  .toList
+                val (confidence, matchedName) = bestScoreWithName(word, names)
+                List(Annotation("lx", id, start, start + word.length - 1,
+                    Map(AnnotationHelper.CoveredText -> word,
+                        AnnotationHelper.Confidence -> confidence.toString,
+                        AnnotationHelper.Lexicon -> lexName,
+                        AnnotationHelper.MatchedText -> matchedName)))
+            } else Nil
+          })
+    }
 
     def buildFq(tagtype: String, lowerCaseInput: Boolean): String = {
         val tagSubtypeQuery = Array("tagsubtype", if (lowerCaseInput) "l" else "x")
@@ -205,6 +244,21 @@ class SodaService {
         else if (score > matchedSpan.length()) 0.0D
         else (1.0D - (1.0D * score / matchedSpan.length()))                               
     }
+
+    /** Picks the name with the smallest Levenshtein distance to matchedSpan; returns (confidence, name).
+      * Confidence is 1 - distance / span length; (0.0, null) when there is no usable match. */
+    def bestScoreWithName(matchedSpan: String, names: List[String]): (Double, String) = {
+        if (matchedSpan.isEmpty || names.isEmpty) (0.0D, null)
+        else {
+            // reverse first so that, among equally distant names, the last one listed wins
+            val (score, bestName) = names
+              .map(name => (StringUtils.getLevenshteinDistance(matchedSpan, name), name))
+              .reverse
+              .minBy(_._1)
+            if (score > matchedSpan.length()) (0.0D, null)
+            else (1.0D - (1.0D * score / matchedSpan.length()), bestName)
+        }
+    }
 
     def getDictInfo(): List[DictInfo] = {
         val params = new ModifiableSolrParams()