diff --git a/docs/api.md b/docs/api.md index df80a9e..e42649c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -103,6 +103,7 @@ Annotates text against a specified lexicon and match type. Match type can be one * __punct__ - removes punctuation and phrase chunks input, then matches phrases against lexicon entries. Also case insensitive. * __sort__ - same as punct, except that words in phrase chunks are sorted and matched against similarly sorted lexicon entries. * __stem__ - same as sort, except words in phrases are stemmed using Porter stemmer and matched against similarly stemmed lexicon entries. +* __fuzzy__ - matches text by building a fuzzy Solr query; the level of fuzziness is controlled by the number of character edits allowed. The more similar the word, the higher the confidence. __URL__ http://host:port/soda/annot.json @@ -139,6 +140,47 @@ __OUTPUT__ ] ```` +__FUZZY SEARCH__ + +The default fuzziness is 1, but any integer can be passed to set the maximum number of character edits considered (Solr fuzzy queries support edit distances from 0 to 2). 
+ +__INPUT__ + +````json + { + "lexicon":"gender", + "text":"Mayer made Fortune magazine history in 2013, Most Powerful women , and 40 Under 40 at the same time", + "matching":"fuzzy", + "fuzziness":1 + } +```` + +__OUTPUT__ + +````json + [ + { + "matchedText": "male", + "begin": 6, + "lexicon": "gender", + "confidence": "0.75", + "id": "man", + "coveredText": "made", + "end": 9 + }, + { + "matchedText": "women", + "begin": 59, + "lexicon": "gender", + "confidence": "1.0", + "id": "female", + "coveredText": "women", + "end": 63 + } + ] +```` + + __EXAMPLE PYTHON CLIENT__ ````python diff --git a/src/main/scala/com/elsevier/soda/AnnotationHelper.scala b/src/main/scala/com/elsevier/soda/AnnotationHelper.scala index 0f169ea..30eb687 100644 --- a/src/main/scala/com/elsevier/soda/AnnotationHelper.scala +++ b/src/main/scala/com/elsevier/soda/AnnotationHelper.scala @@ -12,6 +12,7 @@ object AnnotationHelper { val CoveredText = "covered" val Confidence = "conf" val Lexicon = "lexicon" + val MatchedText = "matchedText" def confToStr(conf: Double) = "%.3f".format(conf) diff --git a/src/main/scala/com/elsevier/soda/SodaController.scala b/src/main/scala/com/elsevier/soda/SodaController.scala index 7679e3e..0464b51 100644 --- a/src/main/scala/com/elsevier/soda/SodaController.scala +++ b/src/main/scala/com/elsevier/soda/SodaController.scala @@ -42,11 +42,12 @@ class SodaController @Autowired() (sodaService: SodaService) { val lexicon = params.getOrElse("lexicon", "").asInstanceOf[String] val text = params.getOrElse("text", "").asInstanceOf[String] val matchFlag = params.getOrElse("matching", "exact").asInstanceOf[String] + val fuzziness = params.getOrElse("fuzziness", 1.0).asInstanceOf[Double] if (lexicon.isEmpty || text.isEmpty) { model.addAttribute("response", SodaUtils.error("Both Lexicon and Text must be specified!")) } else { - val annots = sodaService.annotate(text, lexicon, matchFlag) + val annots = sodaService.annotate(text, lexicon, matchFlag, fuzziness.toInt) 
model.addAttribute("response", sodaService.annotJson(annots)) } "annotate" diff --git a/src/main/scala/com/elsevier/soda/SodaService.scala b/src/main/scala/com/elsevier/soda/SodaService.scala index c9d6fe1..c26906f 100644 --- a/src/main/scala/com/elsevier/soda/SodaService.scala +++ b/src/main/scala/com/elsevier/soda/SodaService.scala @@ -3,7 +3,7 @@ package com.elsevier.soda import scala.collection.JavaConversions.asScalaBuffer import scala.collection.JavaConversions.asScalaIterator import scala.collection.JavaConversions.collectionAsScalaIterable -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, HashMap} import org.apache.commons.lang3.StringUtils import org.apache.solr.client.solrj.SolrRequest import org.apache.solr.client.solrj.impl.HttpSolrClient @@ -17,9 +17,12 @@ import org.apache.solr.common.util.NamedList import org.springframework.stereotype.Service import org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException import java.util.Collections + import org.apache.solr.common.util.ContentStream import org.apache.solr.common.SolrInputDocument import java.util.regex.Pattern + +import org.apache.solr.client.solrj.util.ClientUtils import com.aliasi.chunk.RegExChunker case class DictInfo(dictName: String, numEntries: Long) @@ -39,7 +42,7 @@ class SodaService { val sodaClient = new SodaClient() def annotate(text: String, lexiconName: String, - matchFlag: String): List[Annotation] = { + matchFlag: String, fuzziness: Int = 1): List[Annotation] = { val lexName = if (lexiconName.endsWith("-full")) lexiconName.substring(0, lexiconName.length() - 5) else lexiconName @@ -49,6 +52,7 @@ class SodaService { case "punct" => chunkAndTag(text, lexName, "tagname_nrm") case "sort" => chunkAndTag(text, lexName, "tagname_srt") case "stem" => chunkAndTag(text, lexName, "tagname_stm") + case "fuzzy" => fuzzySearch(text, lexName, fuzziness) case _ => List() } if (lexiconName.endsWith("-full")) annotations @@ -183,6 +187,42 @@ class 
SodaService {
       coveredText.trim().length() > 3 && !stopwords.contains(coveredText)
     }).toList
   }
+
+  /** Fuzzy-matches each space-delimited token of `text` against `lexName`;
+   *  `fuzzinessValue` is clamped to the 0..2 edit distances Lucene supports.
+   *  Offsets are inclusive character positions in `text`. */
+  def fuzzySearch(text: String, lexName: String, fuzzinessValue: Int): List[Annotation] = {
+    val maxEdits = math.max(0, math.min(2, fuzzinessValue))
+    val tags = ArrayBuffer[Annotation]()
+    var currentStart = 0
+    text.split(" ").foreach(word => {
+      // blank tokens (repeated spaces) and stopwords can never be reported,
+      // so skip the Solr round trip but still advance the offset below
+      if (word.nonEmpty && !stopwords.contains(word.toLowerCase())) {
+        val params = new ModifiableSolrParams()
+        params.add(CommonParams.Q,
+          "tagname_str:" + ClientUtils.escapeQueryChars(word) + "~" + maxEdits)
+        params.add(CommonParams.ROWS, "1")
+        params.add(CommonParams.FQ, buildFq(lexName, false))
+        params.add(CommonParams.FL, "id,tagname_str")
+        val results = querySolr.query(params).getResults()
+        if (results.getNumFound() > 0) {
+          val sdoc = results.get(0)
+          val id = sdoc.getFieldValue("id").asInstanceOf[String]
+          val names = sdoc.getFieldValues("tagname_str")
+            .map(_.asInstanceOf[String]).toList
+          val (confidence, matchedName) = bestScoreWithName(word, names)
+          tags += Annotation("lx", id, currentStart, currentStart + word.length - 1,
+            Map(AnnotationHelper.CoveredText -> word,
+                AnnotationHelper.Confidence -> confidence.toString,
+                AnnotationHelper.Lexicon -> lexName,
+                AnnotationHelper.MatchedText -> matchedName))
+        }
+      }
+      currentStart += word.length + 1
+    })
+    tags.toList
+  }
 
   def buildFq(tagtype: String, lowerCaseInput: Boolean): String = {
     val tagSubtypeQuery = Array("tagsubtype", if (lowerCaseInput) "l" else "x")
@@ -205,6 +245,27 @@ class SodaService {
     else if (score > matchedSpan.length()) 0.0D
     else (1.0D - (1.0D * score / matchedSpan.length()))
   }
+
+  /** Picks the name closest to `matchedSpan` by Levenshtein distance.
+   *
+   *  @param matchedSpan text covered in the input
+   *  @param names       candidate lexicon names for the matched entry
+   *  @return (confidence, name); confidence is 1 - distance / span length.
+   *          (0.0, null) when the span or `names` is empty, or when the
+   *          distance exceeds the span length. On ties the last name wins.
+   */
+  def bestScoreWithName(matchedSpan: String, names: List[String]): (Double, String) = {
+    if (matchedSpan.isEmpty || names.isEmpty) (0.0D, null)
+    else {
+      // compute each distance once; `<=` keeps the last name on ties
+      val (score, bestName) = names
+        .map(candidate => (StringUtils.getLevenshteinDistance(matchedSpan, candidate), candidate))
+        .reduceLeft((best, cur) => if (cur._1 <= best._1) cur else best)
+      if (score > matchedSpan.length()) (0.0D, null)
+      else (1.0D - (1.0D * score / matchedSpan.length()), bestName)
+    }
+  }
 
   def getDictInfo(): List[DictInfo] = {
     val params = new ModifiableSolrParams()