Skip to content

Commit 731b95d

Browse files
authored
Better support for elasticsearch characters (#46)
* Support colons in search box & metadata search * remove logging * Add support for other operators in search api & search box * Clean up != behavior, include gte lte in UI * Clean up regex and parsing for exists and missing queries * remove space sensitivity to searches * improve documentation * Fixed spacing check * handle colons on contains matches better * Add numeric detection to mapping * disable lt gt lte gte * update help syntax * Simplify help * colon warning
1 parent 821aa10 commit 731b95d

File tree

4 files changed

+149
-105
lines changed

4 files changed

+149
-105
lines changed

app/services/ElasticsearchPlugin.scala

Lines changed: 120 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import models.{Collection, Dataset, ElasticsearchResult, File, Folder, ResourceR
2727
import play.api.Play.current
2828
import play.api.libs.json._
2929
import _root_.util.SearchUtils
30+
import org.apache.commons.lang.StringUtils
3031
import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest
3132

3233

@@ -48,7 +49,8 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
4849
val nameOfIndex = play.api.Play.configuration.getString("elasticsearchSettings.indexNamePrefix").getOrElse("clowder")
4950
val maxResults = play.api.Play.configuration.getInt("elasticsearchSettings.maxResults").getOrElse(240)
5051

51-
val mustOperators = List("==", "<", ">", ":")
52+
// TODO: Removed gt lt gte lte operators until numeric_detection can be enabled on the dynamic mapper
53+
val mustOperators = List("==", ":") // "<=", ">=", "<", ">", ":")
5254
val mustNotOperators = List("!=")
5355

5456

@@ -130,53 +132,37 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
130132
accumulatePageResult(queryObj, user, from.getOrElse(0), size.getOrElse(maxResults))
131133
}
132134

133-
/** Search using a simple text string, appending parameters from API to string if provided */
135+
/**
136+
* Search using a simple text string.
137+
* The API endpoint supports several parameters like datasetid that are translated and appended to the query first.
138+
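* @param query - Search text; the other API parameters are translated into query syntax and appended to it before parsing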
139+
* @param resource_type - Restrict to particular resource_type
140+
* @param datasetid - Restrict to particular dataset ID (only returns files)
141+
* @param collectionid - Restrict to particular collection ID
142+
* @param spaceid - Restrict to particular space ID
143+
* @param folderid - Restrict to particular folder ID
144+
* @param field - Restrict to a specific metadata field (assumes query is the value)
145+
* @param tag - Restrict to a particular tag (exact match)
146+
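* @param from - Value handed to accumulatePageResult as the starting point; 0 when not given (presumably the offset of the first result)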
147+
* @param size - Value handed to accumulatePageResult as the size; maxResults when not given
148+
* @param permitted
149+
* @param user
150+
* @param index - Name of the index; defaults to nameOfIndex
151+
*/
134152
def search(query: String, resource_type: Option[String], datasetid: Option[String], collectionid: Option[String],
135153
spaceid: Option[String], folderid: Option[String], field: Option[String], tag: Option[String],
136154
from: Option[Int], size: Option[Int], permitted: List[UUID], user: Option[User],
137155
index: String = nameOfIndex): ElasticsearchResult = {
138156

157+
// Convert any parameters from API into the query syntax equivalent so we can parse it all together later
139158
var expanded_query = query
140-
141-
// whether to restrict to a particular metadata field, or search all fields (including tags, name, etc.)
142-
val mdfield = field match {
143-
case Some(k) => expanded_query = " "+k+":\""+expanded_query+"\""
144-
case None => {}
145-
}
146-
147-
// Restrict to a particular tag - currently requires exact match
148-
tag match {
149-
case Some(t) => expanded_query += " tag:"+t
150-
case None => {}
151-
}
152-
153-
// Restrict to particular resource_type if requested
154-
resource_type match {
155-
case Some(restype) => expanded_query += " resource_type:"+restype
156-
case None => {}
157-
}
158-
159-
// Restrict to particular dataset ID (only return files)
160-
datasetid match {
161-
case Some(dsid) => expanded_query += " in:"+dsid+" resource_type:file"
162-
case None => {}
163-
}
164-
165-
// Restrict to particular collection ID
166-
collectionid match {
167-
case Some(cid) => expanded_query += " in:"+cid
168-
case None => {}
169-
}
170-
171-
spaceid match {
172-
case Some(spid) => expanded_query += " in:"+spid
173-
case None => {}
174-
}
175-
176-
folderid match {
177-
case Some(fid) => expanded_query += " in:"+fid
178-
case None => {}
179-
}
159+
field.foreach(k => expanded_query = " "+k+":\""+expanded_query+"\"")
160+
tag.foreach(t => expanded_query += s" tag:$t")
161+
resource_type.foreach(restype => expanded_query += s" resource_type:$restype")
162+
datasetid.foreach(dsid => expanded_query += s" in:$dsid resource_type:file")
163+
collectionid.foreach(cid => expanded_query += s" in:$cid")
164+
spaceid.foreach(spid => expanded_query += s" in:$spid")
165+
folderid.foreach(fid => expanded_query += s" in:$fid")
180166

181167
val queryObj = prepareElasticJsonQuery(expanded_query.stripPrefix(" "), permitted, user)
182168
accumulatePageResult(queryObj, user, from.getOrElse(0), size.getOrElse(maxResults))
@@ -291,7 +277,6 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
291277
}).toList, response.getHits().getTotalHits())
292278
}
293279

294-
295280
/** Create a new index with preconfigured mapping */
296281
def createIndex(index: String = nameOfIndex): Unit = {
297282
val indexSettings = Settings.settingsBuilder().loadFromSource(jsonBuilder()
@@ -706,10 +691,12 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
706691

707692
/** Return string-encoded JSON object describing field types */
708693
def getElasticsearchObjectMappings(): String = {
694+
709695
/** The dynamic template will restrict all dynamic metadata fields to be indexed
710696
* as strings for datatypes besides Objects. In the future, this could
711697
* be removed, but only once the Search API better supports those data types (e.g. Date).
712698
*/
699+
// TODO: Enable "numeric_detection": true alongside date_detection
713700
"""{"clowder_object": {
714701
|"date_detection": false,
715702
|"properties": {
@@ -737,17 +724,27 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
737724

738725
/** Create appropriate search object based on operator */
739726
def parseMustOperators(builder: XContentBuilder, key: String, value: String, operator: String): XContentBuilder = {
740-
// TODO: Suppert lte, gte (<=, >=)
741727
operator match {
742728
case "==" => builder.startObject().startObject("match_phrase").field(key, value).endObject().endObject()
743729
case "<" => builder.startObject().startObject("range").startObject(key).field("lt", value).endObject().endObject().endObject()
744730
case ">" => builder.startObject().startObject("range").startObject(key).field("gt", value).endObject().endObject().endObject()
731+
case "<=" => builder.startObject().startObject("range").startObject(key).field("lte", value).endObject().endObject().endObject()
732+
case ">=" => builder.startObject().startObject("range").startObject(key).field("gte", value).endObject().endObject().endObject()
745733
case ":" => {
746734
if (key == "_all")
747735
builder.startObject().startObject("regexp").field("_all", wrapRegex(value)).endObject().endObject()
748-
else
736+
else if (key == "exists") {
737+
val cleaned = if (!value.startsWith("metadata.")) "metadata."+value else value
738+
builder.startObject().startObject("exists").field("field", cleaned).endObject().endObject()
739+
} else if (key == "missing") {
740+
val cleaned = if (!value.startsWith("metadata.")) "metadata."+value else value
741+
builder.startObject().startObject("bool").startArray("must_not").startObject()
742+
.startObject("exists").field("field", cleaned).endObject().endObject().endArray().endObject().endObject()
743+
} else {
744+
val cleaned = value.replace(":", "\\:") // Colons have special meaning in query_string
749745
builder.startObject().startObject("query_string").field("default_field", key)
750-
.field("query", "\""+value+"\"").endObject().endObject()
746+
.field("query", cleaned).endObject().endObject()
747+
}
751748
}
752749
case _ => {}
753750
}
@@ -763,7 +760,7 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
763760
builder
764761
}
765762

766-
/**Convert list of search term JsValues into an Elasticsearch-ready JSON query object**/
763+
/** Convert list of search term JsValues into an Elasticsearch-ready JSON query object **/
767764
def prepareElasticJsonQuery(query: List[JsValue], grouping: String): XContentBuilder = {
768765
/** OPERATORS
769766
* : contains (partial match)
@@ -792,7 +789,7 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
792789
builder.startArray("should").startObject().startObject("bool")
793790

794791
// 2) populate the MUST/SHOULD portion
795-
if (mustList.length > 0) {
792+
if (mustList.length > 0 || mustNotList.length > 0) {
796793
grouping match {
797794
case "AND" => builder.startArray("must")
798795
case "OR" => builder.startArray("should")
@@ -803,6 +800,13 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
803800
val value = (jv \ "field_value").toString.replace("\"", "")
804801
builder = parseMustOperators(builder, key, value, operator)
805802
})
803+
804+
// Also add != fields to MUST EXISTS query so we don't return all documents without those fields too
805+
mustNotList.foreach(jv => {
806+
val key = (jv \ "field_key").toString.replace("\"","")
807+
builder.startObject().startObject("exists").field("field", key).endObject().endObject()
808+
})
809+
806810
builder.endArray()
807811
}
808812

@@ -838,27 +842,37 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
838842
builder
839843
}
840844

841-
/**Convert search string into an Elasticsearch-ready JSON query object**/
845+
/** Convert search string into an Elasticsearch-ready JSON query object **/
842846
def prepareElasticJsonQuery(query: String, permitted: List[UUID], user: Option[User]): XContentBuilder = {
843-
/** OPERATORS
844-
* == equals (exact match)
845-
* != not equals (partial matches OK)
846-
* < less than
847-
* > greater than
848-
**/
849847

850848
// Use regex to split string into a list, preserving quoted phrases as single value
851849
val matches = ListBuffer[String]()
852-
val m = Pattern.compile("([^\"]\\S*|\".+?\")\\s*").matcher(query)
850+
val m = Pattern.compile("([^\":= ]+|\".+?\")").matcher(query)
851+
//val m = Pattern.compile("([^\":=<> ]+|\".+?\")").matcher(query)
853852
while (m.find()) {
854-
var mat = m.group(1).replace("\"", "").replace("__", " ")
855-
if (mat.startsWith(":")) mat = mat.substring(1)
856-
if (mat.endsWith(":")) mat = mat.substring(0, mat.length-2)
857-
matches += mat
853+
var mat = m.group(1).replace("\"", "").replace("__", " ").trim
854+
if (mat.length>0) {
855+
// Remove operators from terms e.g. <=value becomes value
856+
(mustOperators ::: mustNotOperators).foreach(op => {
857+
if (mat.startsWith(op)) {
858+
// Make sure x<=4 is "x lte 4" not "x lt =4"
859+
var foundLonger = false
860+
(mustOperators ::: mustNotOperators).foreach(longerop => {
861+
if (longerop!=op && longerop.length>op.length && mat.startsWith(longerop)) {
862+
mat = mat.substring(longerop.length)
863+
foundLonger = true
864+
}
865+
})
866+
if (!foundLonger)
867+
mat = mat.substring(op.length)
868+
}
869+
})
870+
matches += mat
871+
}
858872
}
859873

860874
// If a term is specified that isn't in this list, it's assumed to be a metadata field
861-
val official_terms = List("name", "creator", "email", "resource_type", "in", "contains", "tag")
875+
val official_terms = List("name", "creator", "email", "resource_type", "in", "contains", "tag", "exists", "missing")
862876

863877
// Create list of (key, operator, value) for passing to builder
864878
val terms = ListBuffer[(String, String, String)]()
@@ -867,36 +881,44 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
867881
var currval = ""
868882
matches.foreach(mt => {
869883
// Check whether the current term appears before or after one of the operators, and which operator it is
870-
var entryType = "value"
871-
if (query.contains(mt+":") || query.contains("\""+mt+"\":")) {
872-
entryType = "key"
873-
}
884+
var entryType = "unknown"
885+
(mustOperators ::: mustNotOperators).foreach(op => {
886+
val reducedSpaces = StringUtils.normalizeSpace(query)
887+
if ((reducedSpaces.contains(mt+op) || reducedSpaces.contains("\""+mt+"\""+op) ||
888+
reducedSpaces.contains(mt+" "+op) || reducedSpaces.contains("\""+mt+"\" "+op)) && entryType=="unknown") {
889+
entryType = "key"
890+
curropr = op
891+
} else if (reducedSpaces.contains(op+mt) || reducedSpaces.contains(op+"\""+mt+"\"") ||
892+
reducedSpaces.contains(op+" "+mt) || reducedSpaces.contains(op+" \""+mt+"\"")) {
893+
entryType = "value"
894+
curropr = op
895+
}
896+
})
897+
if (entryType=="unknown") entryType = "value"
874898

875899
// Determine if the string was a key or value
876900
if (entryType == "key") {
877901
// Do some user-friendly replacement
878-
if (mt == "tag")
879-
currkey = "tags"
880-
else if (mt == "in")
881-
currkey = "child_of"
882-
else if (mt == "contains")
883-
currkey = "parent_of"
884-
else if (mt == "creator")
885-
currkey = "creator_name"
886-
else if (mt == "email")
887-
currkey = "creator_email"
888-
else if (!official_terms.contains(mt))
889-
currkey = "metadata."+mt
902+
if (mt == "tag") currkey = "tags"
903+
else if (mt == "in") currkey = "child_of"
904+
else if (mt == "contains") currkey = "parent_of"
905+
else if (mt == "creator") currkey = "creator_name"
906+
else if (mt == "email") currkey = "creator_email"
907+
else if (!official_terms.contains(mt)) currkey = "metadata."+mt
890908
else
891909
currkey = mt
892910
} else if (entryType == "value") {
893-
currval += mt.toLowerCase()
911+
if (currkey!="exists" && currkey!="missing")
912+
currval = mt.toLowerCase()
913+
else currval= mt
894914
terms += ((currkey, curropr, currval))
895915
currkey = "_all"
916+
curropr = ":"
896917
currval = ""
897918
}
898919
})
899920

921+
// Now that we have a nicely structured list of (key, operator, value) tuples we can translate to Elastic objects
900922
var builder = jsonBuilder().startObject().startObject("bool")
901923

902924
// First, populate the MUST portion of Bool query
@@ -905,16 +927,22 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
905927
val key = entry._1
906928
val curropr = entry._2
907929
val value = entry._3
908-
for (operator <- mustOperators) {
909-
if (curropr == operator) {
910-
// Only add a MUST object if we have terms to populate it; empty objects break Elasticsearch
911-
if (mustOperators.contains(operator) && !populatedMust) {
912-
builder.startArray("must")
913-
populatedMust = true
914-
}
930+
if (mustOperators.contains(curropr)) {
931+
// Only add a MUST object if we have terms to populate it; empty objects break Elasticsearch
932+
if (!populatedMust) {
933+
builder.startArray("must")
934+
populatedMust = true
935+
}
936+
builder = parseMustOperators(builder, key, value, curropr)
937+
}
915938

916-
builder = parseMustOperators(builder, key, value, operator)
939+
// For != operators, include an EXISTS query to avoid returning all documents without that field
940+
if (mustNotOperators.contains(curropr)) {
941+
if (!populatedMust) {
942+
builder.startArray("must")
943+
populatedMust = true
917944
}
945+
builder.startObject().startObject("exists").field("field", key).endObject().endObject()
918946
}
919947
})
920948

@@ -954,16 +982,13 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
954982
val key = entry._1
955983
val curropr = entry._2
956984
val value = entry._3
957-
for (operator <- mustNotOperators) {
958-
if (curropr == operator) {
959-
// Only add a MUST object if we have terms to populate it; empty objects break Elasticsearch
960-
if (mustNotOperators.contains(operator) && !populatedMustNot) {
961-
builder.startArray("must_not")
962-
populatedMustNot = true
963-
}
964-
965-
builder = parseMustNotOperators(builder, key, value, operator)
985+
if (mustNotOperators.contains(curropr)) {
986+
// Only add a MUST_NOT object if we have terms to populate it; empty objects break Elasticsearch
987+
if (!populatedMustNot) {
988+
builder.startArray("must_not")
989+
populatedMustNot = true
966990
}
991+
builder = parseMustNotOperators(builder, key, value, curropr)
967992
}
968993
})
969994
if (populatedMustNot) builder.endArray()

app/views/metadatald/search.scala.html

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,10 @@ <h1>Search Metadata within Space: "@space.name"</h1>
205205
"<option value='=='>equals</option>" +
206206
"<option value='!='>does not equal</option>" +
207207
"<option value=':'>contains</option>" +
208-
"<option value='>'>greater than</option>" +
209-
"<option value='<'>less than</option>" +
208+
//"<option value='>'>greater than</option>" +
209+
//"<option value='<'>less than</option>" +
210+
//"<option value='>='>greater or equal to</option>" +
211+
//"<option value='<='>less or equal to</option>" +
210212
"</select></div>" +
211213
<!-- VALUE FIELD -->
212214
"<div class='form-group col-lg-4 col-md-4'>" +

0 commit comments

Comments
 (0)