@@ -27,6 +27,7 @@ import models.{Collection, Dataset, ElasticsearchResult, File, Folder, ResourceR
2727import play .api .Play .current
2828import play .api .libs .json ._
2929import _root_ .util .SearchUtils
30+ import org .apache .commons .lang .StringUtils
3031import org .elasticsearch .action .admin .indices .exists .indices .IndicesExistsRequest
3132
3233
@@ -48,7 +49,8 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
4849 val nameOfIndex = play.api.Play .configuration.getString(" elasticsearchSettings.indexNamePrefix" ).getOrElse(" clowder" )
4950 val maxResults = play.api.Play .configuration.getInt(" elasticsearchSettings.maxResults" ).getOrElse(240 )
5051
51- val mustOperators = List (" ==" , " <" , " >" , " :" )
52+ // TODO: Removed gt lt gte lte operators until numeric_detection can be enabled on the dynamic mapper
53+ val mustOperators = List (" ==" , " :" ) // "<=", ">=", "<", ">", ":")
5254 val mustNotOperators = List (" !=" )
5355
5456
@@ -130,53 +132,37 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
130132 accumulatePageResult(queryObj, user, from.getOrElse(0 ), size.getOrElse(maxResults))
131133 }
132134
133- /** Search using a simple text string, appending parameters from API to string if provided */
135+ /**
136+ * Search using a simple text string.
137+ * The API endpoint supports several parameters like datasetid that are translated and appended to the query first.
138+ * @param query
139+ * @param resource_type - Restrict to particular resource_type
140+ * @param datasetid - Restrict to particular dataset ID (only returns files)
141+ * @param collectionid - Restrict to particular collection ID
142+ * @param spaceid - Restrict to particular space ID
143+ * @param folderid - Restrict to particular folder ID
144+ * @param field - Restrict to a specific metadata field (assumes query is the value)
145+ * @param tag - Restrict to a particular tag (exact match)
146+ * @param from
147+ * @param size
148+ * @param permitted
149+ * @param user
150+ * @param index
151+ */
134152 def search (query : String , resource_type : Option [String ], datasetid : Option [String ], collectionid : Option [String ],
135153 spaceid : Option [String ], folderid : Option [String ], field : Option [String ], tag : Option [String ],
136154 from : Option [Int ], size : Option [Int ], permitted : List [UUID ], user : Option [User ],
137155 index : String = nameOfIndex): ElasticsearchResult = {
138156
157+ // Convert any parameters from API into the query syntax equivalent so we can parse it all together later
139158 var expanded_query = query
140-
141- // whether to restrict to a particular metadata field, or search all fields (including tags, name, etc.)
142- val mdfield = field match {
143- case Some (k) => expanded_query = " " + k+ " :\" " + expanded_query+ " \" "
144- case None => {}
145- }
146-
147- // Restrict to a particular tag - currently requires exact match
148- tag match {
149- case Some (t) => expanded_query += " tag:" + t
150- case None => {}
151- }
152-
153- // Restrict to particular resource_type if requested
154- resource_type match {
155- case Some (restype) => expanded_query += " resource_type:" + restype
156- case None => {}
157- }
158-
159- // Restrict to particular dataset ID (only return files)
160- datasetid match {
161- case Some (dsid) => expanded_query += " in:" + dsid+ " resource_type:file"
162- case None => {}
163- }
164-
165- // Restrict to particular collection ID
166- collectionid match {
167- case Some (cid) => expanded_query += " in:" + cid
168- case None => {}
169- }
170-
171- spaceid match {
172- case Some (spid) => expanded_query += " in:" + spid
173- case None => {}
174- }
175-
176- folderid match {
177- case Some (fid) => expanded_query += " in:" + fid
178- case None => {}
179- }
159+ field.foreach(k => expanded_query = " " + k+ " :\" " + expanded_query+ " \" " )
160+ tag.foreach(t => expanded_query += s " tag: $t" )
161+ resource_type.foreach(restype => expanded_query += s " resource_type: $restype" )
162+ datasetid.foreach(dsid => expanded_query += s " in: $dsid resource_type:file " )
163+ collectionid.foreach(cid => expanded_query += s " in: $cid" )
164+ spaceid.foreach(spid => expanded_query += s " in: $spid" )
165+ folderid.foreach(fid => expanded_query += s " in: $fid" )
180166
181167 val queryObj = prepareElasticJsonQuery(expanded_query.stripPrefix(" " ), permitted, user)
182168 accumulatePageResult(queryObj, user, from.getOrElse(0 ), size.getOrElse(maxResults))
@@ -291,7 +277,6 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
291277 }).toList, response.getHits().getTotalHits())
292278 }
293279
294-
295280 /** Create a new index with preconfigured mapping */
296281 def createIndex (index : String = nameOfIndex): Unit = {
297282 val indexSettings = Settings .settingsBuilder().loadFromSource(jsonBuilder()
@@ -706,10 +691,12 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
706691
707692 /** Return string-encoded JSON object describing field types */
708693 def getElasticsearchObjectMappings (): String = {
694+
709695 /** The dynamic template will restrict all dynamic metadata fields to be indexed
710696 * as strings for datatypes besides Objects. In the future, this could
711697 * be removed, but only once the Search API better supports those data types (e.g. Date).
712698 */
699+ // TODO: Enable "numeric_detection": true alongside date_detection
713700 """ {"clowder_object": {
714701 |"date_detection": false,
715702 |"properties": {
@@ -737,17 +724,27 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
737724
738725 /** Create appropriate search object based on operator */
739726 def parseMustOperators (builder : XContentBuilder , key : String , value : String , operator : String ): XContentBuilder = {
740- // TODO: Suppert lte, gte (<=, >=)
741727 operator match {
742728 case " ==" => builder.startObject().startObject(" match_phrase" ).field(key, value).endObject().endObject()
743729 case " <" => builder.startObject().startObject(" range" ).startObject(key).field(" lt" , value).endObject().endObject().endObject()
744730 case " >" => builder.startObject().startObject(" range" ).startObject(key).field(" gt" , value).endObject().endObject().endObject()
731+ case " <=" => builder.startObject().startObject(" range" ).startObject(key).field(" lte" , value).endObject().endObject().endObject()
732+ case " >=" => builder.startObject().startObject(" range" ).startObject(key).field(" gte" , value).endObject().endObject().endObject()
745733 case " :" => {
746734 if (key == " _all" )
747735 builder.startObject().startObject(" regexp" ).field(" _all" , wrapRegex(value)).endObject().endObject()
748- else
736+ else if (key == " exists" ) {
737+ val cleaned = if (! value.startsWith(" metadata." )) " metadata." + value else value
738+ builder.startObject().startObject(" exists" ).field(" field" , cleaned).endObject().endObject()
739+ } else if (key == " missing" ) {
740+ val cleaned = if (! value.startsWith(" metadata." )) " metadata." + value else value
741+ builder.startObject().startObject(" bool" ).startArray(" must_not" ).startObject()
742+ .startObject(" exists" ).field(" field" , cleaned).endObject().endObject().endArray().endObject().endObject()
743+ } else {
744+ val cleaned = value.replace(" :" , " \\ :" ) // Colons have special meaning in query_string
749745 builder.startObject().startObject(" query_string" ).field(" default_field" , key)
750- .field(" query" , " \" " + value+ " \" " ).endObject().endObject()
746+ .field(" query" , cleaned).endObject().endObject()
747+ }
751748 }
752749 case _ => {}
753750 }
@@ -763,7 +760,7 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
763760 builder
764761 }
765762
766- /** Convert list of search term JsValues into an Elasticsearch-ready JSON query object**/
763+ /** Convert list of search term JsValues into an Elasticsearch-ready JSON query object **/
767764 def prepareElasticJsonQuery (query : List [JsValue ], grouping : String ): XContentBuilder = {
768765 /** OPERATORS
769766 * : contains (partial match)
@@ -792,7 +789,7 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
792789 builder.startArray(" should" ).startObject().startObject(" bool" )
793790
794791 // 2) populate the MUST/SHOULD portion
795- if (mustList.length > 0 ) {
792+ if (mustList.length > 0 || mustNotList.length > 0 ) {
796793 grouping match {
797794 case " AND" => builder.startArray(" must" )
798795 case " OR" => builder.startArray(" should" )
@@ -803,6 +800,13 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
803800 val value = (jv \ " field_value" ).toString.replace(" \" " , " " )
804801 builder = parseMustOperators(builder, key, value, operator)
805802 })
803+
804+ // Also add != fields to MUST EXISTS query so we don't return all documents without those fields too
805+ mustNotList.foreach(jv => {
806+ val key = (jv \ " field_key" ).toString.replace(" \" " ," " )
807+ builder.startObject().startObject(" exists" ).field(" field" , key).endObject().endObject()
808+ })
809+
806810 builder.endArray()
807811 }
808812
@@ -838,27 +842,37 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
838842 builder
839843 }
840844
841- /** Convert search string into an Elasticsearch-ready JSON query object**/
845+ /** Convert search string into an Elasticsearch-ready JSON query object **/
842846 def prepareElasticJsonQuery (query : String , permitted : List [UUID ], user : Option [User ]): XContentBuilder = {
843- /** OPERATORS
844- * == equals (exact match)
845- * != not equals (partial matches OK)
846- * < less than
847- * > greater than
848- **/
849847
850848 // Use regex to split string into a list, preserving quoted phrases as single value
851849 val matches = ListBuffer [String ]()
852- val m = Pattern .compile(" ([^\" ]\\ S*|\" .+?\" )\\ s*" ).matcher(query)
850+ val m = Pattern .compile(" ([^\" := ]+|\" .+?\" )" ).matcher(query)
851+ // val m = Pattern.compile("([^\":=<> ]+|\".+?\")").matcher(query)
853852 while (m.find()) {
854- var mat = m.group(1 ).replace(" \" " , " " ).replace(" __" , " " )
855- if (mat.startsWith(" :" )) mat = mat.substring(1 )
856- if (mat.endsWith(" :" )) mat = mat.substring(0 , mat.length- 2 )
857- matches += mat
853+ var mat = m.group(1 ).replace(" \" " , " " ).replace(" __" , " " ).trim
854+ if (mat.length> 0 ) {
855+ // Remove operators from terms e.g. <=value becomes value
856+ (mustOperators ::: mustNotOperators).foreach(op => {
857+ if (mat.startsWith(op)) {
858+ // Make sure x<=4 is "x lte 4" not "x lt =4"
859+ var foundLonger = false
860+ (mustOperators ::: mustNotOperators).foreach(longerop => {
861+ if (longerop!= op && longerop.length> op.length && mat.startsWith(longerop)) {
862+ mat = mat.substring(longerop.length)
863+ foundLonger = true
864+ }
865+ })
866+ if (! foundLonger)
867+ mat = mat.substring(op.length)
868+ }
869+ })
870+ matches += mat
871+ }
858872 }
859873
860874 // If a term is specified that isn't in this list, it's assumed to be a metadata field
861- val official_terms = List (" name" , " creator" , " email" , " resource_type" , " in" , " contains" , " tag" )
875+ val official_terms = List (" name" , " creator" , " email" , " resource_type" , " in" , " contains" , " tag" , " exists " , " missing " )
862876
863877 // Create list of (key, operator, value) for passing to builder
864878 val terms = ListBuffer [(String , String , String )]()
@@ -867,36 +881,44 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
867881 var currval = " "
868882 matches.foreach(mt => {
869883 // Check if the current term appears before or after one of the operators, and which operator it is
870- var entryType = " value"
871- if (query.contains(mt+ " :" ) || query.contains(" \" " + mt+ " \" :" )) {
872- entryType = " key"
873- }
884+ var entryType = " unknown"
885+ (mustOperators ::: mustNotOperators).foreach(op => {
886+ val reducedSpaces = StringUtils .normalizeSpace(query)
887+ if ((reducedSpaces.contains(mt+ op) || reducedSpaces.contains(" \" " + mt+ " \" " + op) ||
888+ reducedSpaces.contains(mt+ " " + op) || reducedSpaces.contains(" \" " + mt+ " \" " + op)) && entryType== " unknown" ) {
889+ entryType = " key"
890+ curropr = op
891+ } else if (reducedSpaces.contains(op+ mt) || reducedSpaces.contains(op+ " \" " + mt+ " \" " ) ||
892+ reducedSpaces.contains(op+ " " + mt) || reducedSpaces.contains(op+ " \" " + mt+ " \" " )) {
893+ entryType = " value"
894+ curropr = op
895+ }
896+ })
897+ if (entryType== " unknown" ) entryType = " value"
874898
875899 // Determine if the string was a key or value
876900 if (entryType == " key" ) {
877901 // Do some user-friendly replacement
878- if (mt == " tag" )
879- currkey = " tags"
880- else if (mt == " in" )
881- currkey = " child_of"
882- else if (mt == " contains" )
883- currkey = " parent_of"
884- else if (mt == " creator" )
885- currkey = " creator_name"
886- else if (mt == " email" )
887- currkey = " creator_email"
888- else if (! official_terms.contains(mt))
889- currkey = " metadata." + mt
902+ if (mt == " tag" ) currkey = " tags"
903+ else if (mt == " in" ) currkey = " child_of"
904+ else if (mt == " contains" ) currkey = " parent_of"
905+ else if (mt == " creator" ) currkey = " creator_name"
906+ else if (mt == " email" ) currkey = " creator_email"
907+ else if (! official_terms.contains(mt)) currkey = " metadata." + mt
890908 else
891909 currkey = mt
892910 } else if (entryType == " value" ) {
893- currval += mt.toLowerCase()
911+ if (currkey!= " exists" && currkey!= " missing" )
912+ currval = mt.toLowerCase()
913+ else currval= mt
894914 terms += ((currkey, curropr, currval))
895915 currkey = " _all"
916+ curropr = " :"
896917 currval = " "
897918 }
898919 })
899920
921+ // Now that we have a nicely structured list of (key, operator, value) tuples we can translate to Elastic objects
900922 var builder = jsonBuilder().startObject().startObject(" bool" )
901923
902924 // First, populate the MUST portion of Bool query
@@ -905,16 +927,22 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
905927 val key = entry._1
906928 val curropr = entry._2
907929 val value = entry._3
908- for (operator <- mustOperators) {
909- if (curropr == operator) {
910- // Only add a MUST object if we have terms to populate it; empty objects break Elasticsearch
911- if (mustOperators.contains(operator) && ! populatedMust) {
912- builder.startArray(" must" )
913- populatedMust = true
914- }
930+ if (mustOperators.contains(curropr)) {
931+ // Only add a MUST object if we have terms to populate it; empty objects break Elasticsearch
932+ if (! populatedMust) {
933+ builder.startArray(" must" )
934+ populatedMust = true
935+ }
936+ builder = parseMustOperators(builder, key, value, curropr)
937+ }
915938
916- builder = parseMustOperators(builder, key, value, operator)
939+ // For != operators, include an EXISTS query to avoid returning all documents without that field
940+ if (mustNotOperators.contains(curropr)) {
941+ if (! populatedMust) {
942+ builder.startArray(" must" )
943+ populatedMust = true
917944 }
945+ builder.startObject().startObject(" exists" ).field(" field" , key).endObject().endObject()
918946 }
919947 })
920948
@@ -954,16 +982,13 @@ class ElasticsearchPlugin(application: Application) extends Plugin {
954982 val key = entry._1
955983 val curropr = entry._2
956984 val value = entry._3
957- for (operator <- mustNotOperators) {
958- if (curropr == operator) {
959- // Only add a MUST object if we have terms to populate it; empty objects break Elasticsearch
960- if (mustNotOperators.contains(operator) && ! populatedMustNot) {
961- builder.startArray(" must_not" )
962- populatedMustNot = true
963- }
964-
965- builder = parseMustNotOperators(builder, key, value, operator)
985+ if (mustNotOperators.contains(curropr)) {
986+ // Only add a MUST_NOT object if we have terms to populate it; empty objects break Elasticsearch
987+ if (! populatedMustNot) {
988+ builder.startArray(" must_not" )
989+ populatedMustNot = true
966990 }
991+ builder = parseMustNotOperators(builder, key, value, curropr)
967992 }
968993 })
969994 if (populatedMustNot) builder.endArray()
0 commit comments