|
| 1 | +package de.ocrd.olahd.elasticsearch; |
| 2 | + |
| 3 | +import static de.ocrd.olahd.Constants.LOGICAL_INDEX_NAME; |
| 4 | + |
| 5 | +import de.ocrd.olahd.domain.EsNumberQuery; |
| 6 | +import de.ocrd.olahd.domain.SearchTerms; |
| 7 | +import de.ocrd.olahd.utils.Utils; |
| 8 | +import java.util.ArrayList; |
| 9 | +import java.util.HashMap; |
| 10 | +import java.util.HashSet; |
| 11 | +import java.util.List; |
| 12 | +import java.util.Map; |
| 13 | +import java.util.Map.Entry; |
| 14 | +import java.util.Set; |
| 15 | +import org.apache.commons.lang3.StringUtils; |
| 16 | +import org.elasticsearch.action.search.SearchRequest; |
| 17 | +import org.elasticsearch.index.query.BoolQueryBuilder; |
| 18 | +import org.elasticsearch.index.query.QueryBuilder; |
| 19 | +import org.elasticsearch.index.query.QueryBuilders; |
| 20 | +import org.elasticsearch.search.aggregations.AggregationBuilders; |
| 21 | +import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; |
| 22 | +import org.elasticsearch.search.builder.SearchSourceBuilder; |
| 23 | + |
| 24 | +/** |
| 25 | + * Class to create the facet-search-query. Groups together steps to create the Elasticsearch Query to do the facet |
| 26 | + * search |
| 27 | + */ |
| 28 | +public class ElasticQueryHelperV2 { |
| 29 | + |
| 30 | + /** Name of the aggregation containing the search hits */ |
| 31 | + public static final String HITS_AGG = "group-by-pid"; |
| 32 | + /** Name of the sub-aggregation containing the pids per facet */ |
| 33 | + public static final String SUB_AGG_PIDS = "pids-per-facet"; |
| 34 | + /** Max size of pids-per-facet aggregation */ |
| 35 | + public static final int MAX_PID_PER_FACET = 100; |
| 36 | + public static final String COUNTER_AGG = "counter"; |
| 37 | + |
| 38 | + /** Fields which are fetched from source */ |
| 39 | + private static final String[] SOURCE_FIELDS = new String[] { "pid", "publish_infos", "title", "doctype", "IsGt", |
| 40 | + "creator_infos", "structrun", "filegrp_use_types" |
| 41 | + }; |
| 42 | + |
| 43 | + private final static String F_CREATOR = "Creators"; |
| 44 | + private final static String F_TITLE = "Titles"; |
| 45 | + private final static String F_PLACE = "Place"; |
| 46 | + private final static String F_YEAR = "Publish Year"; |
| 47 | + private final static String F_FGRP = "File Groups"; |
| 48 | + |
| 49 | + /** |
| 50 | + * Mapping from filter-name to corresponding column |
| 51 | + * |
| 52 | + * Filters are named "Creators", "Titles" or "Publishers" etc. This function returns the corresponding column from |
| 53 | + * the Elasticsearch-entry. For example for Filter Creator, the column to filter must be creator_infos.name.keyword |
| 54 | + */ |
| 55 | + public static final Map<String, String> FILTER_MAP = Map.of( |
| 56 | + F_CREATOR, "creator_infos.name.keyword", |
| 57 | + F_TITLE, "title.title.keyword", |
| 58 | + F_PLACE, "publish_infos.place_publish.keyword", |
| 59 | + F_YEAR, "publish_infos.year_publish", |
| 60 | + F_FGRP, "filegrp_use_types.keyword" |
| 61 | + ); |
| 62 | + |
| 63 | + private SearchTerms searchterms; |
| 64 | + private int limit; |
| 65 | + private int offset; |
| 66 | + private boolean extended; |
| 67 | + private Boolean isGt; |
| 68 | + private boolean metadatasearch; |
| 69 | + private boolean fulltextsearch; |
| 70 | + private String sort; |
| 71 | + private String[] field; |
| 72 | + private String[] value; |
| 73 | + private Set<String> fulltextPids; |
| 74 | + |
| 75 | + public ElasticQueryHelperV2( |
| 76 | + SearchTerms searchterm, int limit, int offset, boolean extended, Boolean isGt, |
| 77 | + boolean metadatasearch, boolean fulltextsearch, String sort, String[] field, String[] value, |
| 78 | + Set<String> fulltextPids |
| 79 | + ) { |
| 80 | + super(); |
| 81 | + this.searchterms = searchterm; |
| 82 | + this.limit = limit; |
| 83 | + this.offset = offset; |
| 84 | + this.extended = extended; |
| 85 | + this.isGt = isGt; |
| 86 | + this.metadatasearch = metadatasearch; |
| 87 | + this.fulltextsearch = fulltextsearch; |
| 88 | + this.sort = sort; |
| 89 | + this.field = field; |
| 90 | + this.value = value; |
| 91 | + this.fulltextPids = fulltextPids; |
| 92 | + } |
| 93 | + |
| 94 | + /** |
| 95 | + * Create the "searchSource". This is the search-Document elasticsearch executes |
| 96 | + * |
| 97 | + * The search consists of four parts: - the part of the query responsible for matching the documents |
| 98 | + * (query.bool.must) - the part of the query for filtering the results (query.bool.filter) - the aggregation used to |
| 99 | + * group the search hits (aggregations.group-by-pid) - the aggregations for collecting the facets |
| 100 | + * (aggregations.Titles, aggregations.Creators ...) |
| 101 | + * |
| 102 | + * @return |
| 103 | + */ |
| 104 | + public SearchRequest createSearchRequest() { |
| 105 | + SearchRequest res = new SearchRequest().indices(LOGICAL_INDEX_NAME); |
| 106 | + SearchSourceBuilder source = new SearchSourceBuilder(); |
| 107 | + res.source(source); |
| 108 | + |
| 109 | + // part 1: matching |
| 110 | + BoolQueryBuilder query = this.createQuery(); |
| 111 | + // part 2: filters |
| 112 | + // TODO: according to API do not use filters if 'extended' is specified. could be added here |
| 113 | + this.addFacetFilters(query); |
| 114 | + // part 4: aggregations for collecting the facets |
| 115 | + List<TermsAggregationBuilder> aggFacets = this.createFacetAggregations(); |
| 116 | + |
| 117 | + // putting things together |
| 118 | + source.query(query); |
| 119 | + source.from(offset); |
| 120 | + source.size(limit); |
| 121 | + |
| 122 | + for (TermsAggregationBuilder agg : aggFacets) { |
| 123 | + source.aggregation(agg); |
| 124 | + } |
| 125 | + return res; |
| 126 | + } |
| 127 | + |
| 128 | + /** |
| 129 | + * Add filters corresponding to the facets selected by the user |
| 130 | + * |
| 131 | + * @param query |
| 132 | + */ |
| 133 | + private void addFacetFilters(BoolQueryBuilder query) { |
| 134 | + if (field != null && field.length > 0) { |
| 135 | + Map<String, List<String>> filters = new HashMap<>(); |
| 136 | + for (int i = 0; i < field.length; i++) { |
| 137 | + String fieldName = FILTER_MAP.getOrDefault(field[i], field[i]); |
| 138 | + filters.putIfAbsent(fieldName, new ArrayList<>()); |
| 139 | + filters.get(fieldName).add(value[i]); |
| 140 | + } |
| 141 | + BoolQueryBuilder boolMust = QueryBuilders.boolQuery(); |
| 142 | + for (Entry<String, List<String>> entry : filters.entrySet()) { |
| 143 | + BoolQueryBuilder boolShould = QueryBuilders.boolQuery(); |
| 144 | + for (String filterValue : entry.getValue()) { |
| 145 | + boolShould.should(QueryBuilders.termQuery(entry.getKey(), filterValue)); |
| 146 | + } |
| 147 | + boolMust.must(boolShould); |
| 148 | + } |
| 149 | + query.filter(boolMust); |
| 150 | + } |
| 151 | + } |
| 152 | + |
| 153 | + private BoolQueryBuilder createQuery() { |
| 154 | + BoolQueryBuilder res = QueryBuilders.boolQuery(); |
| 155 | + String searchterm = searchterms.getSearchterm(); |
| 156 | + res = res.must(QueryBuilders.matchQuery("IsFirst", true)); |
| 157 | + if (StringUtils.isNotBlank(searchterm)) { |
| 158 | + if (metadatasearch && fulltextsearch) { |
| 159 | + BoolQueryBuilder boolMust = QueryBuilders.boolQuery(); |
| 160 | + BoolQueryBuilder boolShould = QueryBuilders.boolQuery(); |
| 161 | + boolShould.should(addMatchOrQstr("metadata", searchterm)); |
| 162 | + boolShould.should( |
| 163 | + QueryBuilders.termsQuery("pid.keyword", fulltextPids != null ? fulltextPids : new HashSet<>()) |
| 164 | + ); |
| 165 | + res = res.must(boolMust.must(boolShould)); |
| 166 | + } else if (fulltextsearch) { |
| 167 | + res = res.must( |
| 168 | + QueryBuilders.termsQuery("pid.keyword", fulltextPids != null ? fulltextPids : new HashSet<>()) |
| 169 | + ); |
| 170 | + } else { |
| 171 | + res = res.must(addMatchOrQstr("metadata", searchterm)); |
| 172 | + } |
| 173 | + } |
| 174 | + if (Boolean.TRUE.equals(this.isGt)) { |
| 175 | + res = res.must(QueryBuilders.matchQuery("IsGt", true)); |
| 176 | + } |
| 177 | + // This sets filters for the "advanced search" |
| 178 | + if (searchterms.hasFilter()) { |
| 179 | + if (StringUtils.isNotBlank(searchterms.getAuthor())) { |
| 180 | + res = res.filter( |
| 181 | + addMatchOrQstr("creator_infos.name", searchterms.getAuthor()) |
| 182 | + ); |
| 183 | + } |
| 184 | + if (StringUtils.isNotBlank(searchterms.getTitle())) { |
| 185 | + res = res.filter(addMatchOrQstr("title.title", searchterms.getTitle())); |
| 186 | + } |
| 187 | + if (StringUtils.isNotBlank(searchterms.getPlace())) { |
| 188 | + res = res.filter( |
| 189 | + addMatchOrQstr("publish_infos.place_publish", searchterms.getPlace()) |
| 190 | + ); |
| 191 | + } |
| 192 | + if (StringUtils.isNotBlank(searchterms.getYear())) { |
| 193 | + QueryBuilder numberQuery = createYearQuery(searchterms.getYear()); |
| 194 | + if (numberQuery != null) { |
| 195 | + res = res.filter(numberQuery); |
| 196 | + } else { |
| 197 | + Utils.logInfo("Search for year cannot be used: '" + searchterms.getYear() + "'"); |
| 198 | + } |
| 199 | + } |
| 200 | + } |
| 201 | + return res; |
| 202 | + } |
| 203 | + |
| 204 | + /** |
| 205 | + * Add a match- or a query-string-query |
| 206 | + * |
| 207 | + * This method checks weather the searchterm contains one or more asterisks and then either creates a match query or |
| 208 | + * a query_string_query. |
| 209 | + * |
| 210 | + * Previously we only had a match query which hits on complete word matches. Later we wanted to add a wildcard |
| 211 | + * search. This creates one of it. I wanted to keep the match query for when the asterisk is not used, because |
| 212 | + * I think it is faster. |
| 213 | + * |
| 214 | + * @param string |
| 215 | + * @param searchterm |
| 216 | + * @return |
| 217 | + */ |
| 218 | + private QueryBuilder addMatchOrQstr(String fieldname, String searchterm) { |
| 219 | + if (searchterm.indexOf('*') > -1) { |
| 220 | + return QueryBuilders.queryStringQuery(searchterm).field(fieldname); |
| 221 | + } else { |
| 222 | + return QueryBuilders.matchQuery(fieldname, searchterm); |
| 223 | + } |
| 224 | + } |
| 225 | + |
| 226 | + private QueryBuilder createYearQuery(String numberStr) { |
| 227 | + if (numberStr.indexOf('*') > -1) { |
| 228 | + // asterisk and range cannot work together |
| 229 | + if (numberStr.indexOf('>') > -1 || numberStr.indexOf('<') > -1) { |
| 230 | + return null; |
| 231 | + } |
| 232 | + return QueryBuilders.queryStringQuery(numberStr).field("publish_infos.year_publish_string"); |
| 233 | + } |
| 234 | + EsNumberQuery x = EsNumberQuery.fromQueryString(numberStr); |
| 235 | + if (x == null) { |
| 236 | + return null; |
| 237 | + } else if (x.cmp == EsNumberQuery.Cmp.EQ) { |
| 238 | + return QueryBuilders.matchQuery("publish_infos.year_publish", numberStr); |
| 239 | + } |
| 240 | + |
| 241 | + var rangeQuery = QueryBuilders.rangeQuery("publish_infos.year_publish"); |
| 242 | + switch (x.cmp) { |
| 243 | + case GT: |
| 244 | + return rangeQuery.gt(x.value1); |
| 245 | + case LT: |
| 246 | + return rangeQuery.lt(x.value1); |
| 247 | + case GTE: |
| 248 | + return rangeQuery.gte(x.value1); |
| 249 | + case LTE: |
| 250 | + return rangeQuery.lte(x.value1); |
| 251 | + case RANGE: |
| 252 | + return rangeQuery.gte(x.value1).lte(x.value2); |
| 253 | + default: |
| 254 | + // This cannot happen, except the enum was extended |
| 255 | + throw new AssertionError("Unexpected switch default: createYearQuery"); |
| 256 | + } |
| 257 | + } |
| 258 | + |
| 259 | + private List<TermsAggregationBuilder> createFacetAggregations() { |
| 260 | + List<TermsAggregationBuilder> res = new ArrayList<>(); |
| 261 | + // Facets |
| 262 | + res.add(createSingleFacetAggregation(F_TITLE, FILTER_MAP.get(F_TITLE))); |
| 263 | + res.add(createSingleFacetAggregation(F_CREATOR, FILTER_MAP.get(F_CREATOR))); |
| 264 | + res.add(createSingleFacetAggregation(F_PLACE, FILTER_MAP.get(F_PLACE))); |
| 265 | + res.add(createSingleFacetAggregation(F_YEAR, FILTER_MAP.get(F_YEAR))); |
| 266 | + res.add(createSingleFacetAggregation(F_FGRP, FILTER_MAP.get(F_FGRP))); |
| 267 | + return res; |
| 268 | + } |
| 269 | + |
| 270 | + private TermsAggregationBuilder createSingleFacetAggregation(String term, String field) { |
| 271 | + return AggregationBuilders.terms(term).field(field); |
| 272 | + } |
| 273 | +} |
0 commit comments