Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ Annotates text against a specified lexicon and match type. Match type can be one
* __punct__ - removes punctuation and phrase chunks input, then matches phrases against lexicon entries. Also case insensitive.
* __sort__ - same as punct, except that words in phrase chunks are sorted and matched against similarly sorted lexicon entries.
* __stem__ - same as sort, except words in phrases are stemmed using Porter stemmer and matched against similarly stemmed lexicon entries.
* __fuzzy__ - matches each word of the text against lexicon entries using a fuzzy Solr query; the level of fuzziness is controlled by the number of characters allowed to differ. The more similar the word, the higher the confidence.

__URL__ http://host:port/soda/annot.json

Expand Down Expand Up @@ -139,6 +140,47 @@ __OUTPUT__
]
````

__FUZZY SEARCH__

The default fuzziness is 1, but any integer can be passed; it indicates the number of fuzzy characters being considered.

__INPUT__

````json
{
"lexicon":"gender",
"text":"Mayer made Fortune magazine history in 2013, Most Powerful women , and 40 Under 40 at the same time",
"matching":"fuzzy",
"fuzziness":1
}
````

__OUTPUT__

````json
[
{
"matchedText": "male",
"begin": 6,
"lexicon": "gender",
"confidence": "0.75",
"id": "man",
"coveredText": "made",
"end": 9
},
{
"matchedText": "women",
"begin": 59,
"lexicon": "gender",
"confidence": "1.0",
"id": "female",
"coveredText": "women",
"end": 63
}
]
````


__EXAMPLE PYTHON CLIENT__

````python
Expand Down
1 change: 1 addition & 0 deletions src/main/scala/com/elsevier/soda/AnnotationHelper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ object AnnotationHelper {
val CoveredText = "covered"
val Confidence = "conf"
val Lexicon = "lexicon"
val MatchedText = "matchedText"

/** Renders a confidence value as text with exactly three fractional digits. */
def confToStr(conf: Double): String = f"$conf%.3f"

Expand Down
3 changes: 2 additions & 1 deletion src/main/scala/com/elsevier/soda/SodaController.scala
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,12 @@ class SodaController @Autowired() (sodaService: SodaService) {
val lexicon = params.getOrElse("lexicon", "").asInstanceOf[String]
val text = params.getOrElse("text", "").asInstanceOf[String]
val matchFlag = params.getOrElse("matching", "exact").asInstanceOf[String]
val fuzziness = params.getOrElse("fuzziness", 1.0).asInstanceOf[Double]
if (lexicon.isEmpty || text.isEmpty) {
model.addAttribute("response",
SodaUtils.error("Both Lexicon and Text must be specified!"))
} else {
val annots = sodaService.annotate(text, lexicon, matchFlag)
val annots = sodaService.annotate(text, lexicon, matchFlag, fuzziness.toInt)
model.addAttribute("response", sodaService.annotJson(annots))
}
"annotate"
Expand Down
58 changes: 56 additions & 2 deletions src/main/scala/com/elsevier/soda/SodaService.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package com.elsevier.soda
import scala.collection.JavaConversions.asScalaBuffer
import scala.collection.JavaConversions.asScalaIterator
import scala.collection.JavaConversions.collectionAsScalaIterable
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.{ArrayBuffer, HashMap}
import org.apache.commons.lang3.StringUtils
import org.apache.solr.client.solrj.SolrRequest
import org.apache.solr.client.solrj.impl.HttpSolrClient
Expand All @@ -17,9 +17,11 @@ import org.apache.solr.common.util.NamedList
import org.springframework.stereotype.Service
import org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException
import java.util.Collections

import org.apache.solr.common.util.ContentStream
import org.apache.solr.common.SolrInputDocument
import java.util.regex.Pattern

import com.aliasi.chunk.RegExChunker

/** Pairs a dictionary name with a Long entry count; presumably the number of entries in that lexicon (confirm against getDictInfo). */
case class DictInfo(dictName: String, numEntries: Long)
Expand All @@ -39,7 +41,7 @@ class SodaService {
val sodaClient = new SodaClient()

def annotate(text: String, lexiconName: String,
matchFlag: String): List[Annotation] = {
matchFlag: String, fuzziness: Int = 1): List[Annotation] = {
val lexName = if (lexiconName.endsWith("-full"))
lexiconName.substring(0, lexiconName.length() - 5)
else lexiconName
Expand All @@ -49,6 +51,7 @@ class SodaService {
case "punct" => chunkAndTag(text, lexName, "tagname_nrm")
case "sort" => chunkAndTag(text, lexName, "tagname_srt")
case "stem" => chunkAndTag(text, lexName, "tagname_stm")
case "fuzzy" => fuzzySearch(text, lexName, fuzziness)
case _ => List()
}
if (lexiconName.endsWith("-full")) annotations
Expand Down Expand Up @@ -183,6 +186,42 @@ class SodaService {
coveredText.trim().length() > 3 && !stopwords.contains(coveredText)
}).toList
}

/**
 * Annotates `text` by issuing one fuzzy Solr query per space-separated token.
 *
 * Each non-empty token is looked up in the `tagname_str` field with the
 * Lucene fuzzy operator (`token~fuzzinessValue`), restricted to the given
 * lexicon through `buildFq`. Only the top hit (rows = 1) is considered; its
 * names are scored against the token with `bestScoreWithName`.
 *
 * @param text           input text; tokens are obtained with `split(" ")`
 * @param lexName        lexicon name used to build the filter query
 * @param fuzzinessValue number appended after `~` in the fuzzy query
 * @return one annotation per token that had a hit, excluding tokens whose
 *         lower-cased form is in `stopwords`; `begin` is the token's offset
 *         in `text` and `end` is the offset of its last character (inclusive)
 */
def fuzzySearch(text: String, lexName: String, fuzzinessValue: Int): List[Annotation] = {
  import org.apache.solr.client.solrj.util.ClientUtils

  val words = text.split(" ")
  val tags = ArrayBuffer[Annotation]()
  // Character offset of the current token within `text`.
  var currentStart = 0
  words.foreach(word => {
    // Consecutive spaces produce empty tokens. An empty term would yield the
    // malformed query "tagname_str:~N", so skip the lookup for those while
    // still advancing the offset below.
    if (!word.isEmpty) {
      val params = new ModifiableSolrParams()
      // Escape query-syntax characters (':', '(', '"', ...) so arbitrary input
      // text cannot break or alter the Solr query.
      params.add(CommonParams.Q,
        "tagname_str:" + ClientUtils.escapeQueryChars(word) + "~" + fuzzinessValue)
      params.add(CommonParams.ROWS, "1")
      params.add(CommonParams.FQ, buildFq(lexName, false))
      params.add(CommonParams.FL, "id,tagname_str")

      val resp = querySolr.query(params)
      val results = resp.getResults()
      if (results.getNumFound() > 0) {
        val sdoc = results.get(0)
        val id = sdoc.getFieldValue("id").asInstanceOf[String]
        val names = sdoc.getFieldValues("tagname_str")
          .map(_.asInstanceOf[String])
          .toList
        val (confidence, matchedName) = bestScoreWithName(word, names)
        tags += (Annotation("lx", id, currentStart, currentStart + word.length - 1,
          Map(AnnotationHelper.CoveredText -> word,
              AnnotationHelper.Confidence -> confidence.toString,
              AnnotationHelper.Lexicon -> lexName,
              AnnotationHelper.MatchedText -> matchedName)))
      }
    }
    // +1 accounts for the single space separator consumed by split.
    currentStart = currentStart + word.length + 1
  })
  tags.filter(annot => {
    val coveredText = annot.props(AnnotationHelper.CoveredText)
      .toLowerCase()
    !stopwords.contains(coveredText)
  }).toList
}

def buildFq(tagtype: String, lowerCaseInput: Boolean): String = {
val tagSubtypeQuery = Array("tagsubtype", if (lowerCaseInput) "l" else "x")
Expand All @@ -205,6 +244,21 @@ class SodaService {
else if (score > matchedSpan.length()) 0.0D
else (1.0D - (1.0D * score / matchedSpan.length()))
}

/**
 * Picks the name closest to `matchedSpan` by Levenshtein distance and turns
 * that distance into a confidence score.
 *
 * @param matchedSpan text that was matched in the input
 * @param names       candidate names to compare against
 * @return `(confidence, name)` where confidence is
 *         `1 - distance / matchedSpan.length` and `name` is the closest
 *         candidate (the first one on ties); `(0.0, null)` when `matchedSpan`
 *         is empty, `names` is empty, or the best distance exceeds the span
 *         length
 */
def bestScoreWithName(matchedSpan: String, names: List[String]): (Double, String) = {
  // Guard first: taking the minimum of an empty candidate list would throw.
  if (matchedSpan.length() == 0 || names.isEmpty) (0.0D, null)
  else {
    // Compute each distance once and keep it paired with its name.
    val (bestName, score) = names
      .map(name => (name, StringUtils.getLevenshteinDistance(matchedSpan, name)))
      .minBy(_._2)
    if (score > matchedSpan.length()) (0.0D, null)
    else (1.0D - (1.0D * score / matchedSpan.length()), bestName)
  }
}

def getDictInfo(): List[DictInfo] = {
val params = new ModifiableSolrParams()
Expand Down