Skip to content

Commit 9ca7e6a

Browse files
committed
Simplify and improve the facet-search
Currently the old search can still be used, so that it is possible to compare the new version to the old one. To remove it, delete ElasticQueryHelper.java, every unused method of ElasticResponseHelper.java, and ElasticSearchService.facetSearch(). The old version searches for the search term in all documents of all indices. This way duplicates for a single pid/workspace are possible, so the results are grouped by pid. This can lead to problems with the facets because not all documents contain all the information. More details in Jira: OLAHDS-445. The new version only searches in log entries having isFirst = true, so that only one hit per pid/workspace is possible. This simplifies the search, but not all information is searchable any more because not all entries are queried. It is a trade-off, but the results are better, especially the facets. This way it would also be possible to implement sorting, and I think it must be faster because fewer aggregations are used (not verified, though).
1 parent 978f372 commit 9ca7e6a

File tree

5 files changed

+440
-5
lines changed

5 files changed

+440
-5
lines changed

docker-compose.localdev.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ services:
3434
- ./cfg/localdev/commons.env:/etc/env/commons.env
3535
- ./cfg/localdev/commons-static.env:/etc/env/commons-static.env
3636
- ./cfg/localdev/iiif.env:/etc/env/iiif.env
37+
ports:
38+
- "1323:1323"
3739
networks:
3840
olahd:
3941
aliases:

src/main/java/de/ocrd/olahd/controller/SearchController.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,9 @@ public ResponseEntity<?> search(
206206
@RequestParam(required = false) @ApiParam(value = "Place filter", required = false)
207207
String place,
208208
@RequestParam(required = false) @ApiParam(value = "Year filter", required = false)
209-
String year
209+
String year,
210+
@RequestParam(defaultValue = "2") @ApiParam(value = "Currently there are 2 different search versions, old = 1 and new one = 2", required = false)
211+
int searchVersion
210212
) throws IOException {
211213
if (field != null) {
212214
if (value == null || field.length != value.length) {
@@ -250,13 +252,22 @@ public ResponseEntity<?> search(
250252
return ResponseEntity.ok(detail);
251253
} else {
252254
SearchTerms searchterms = new SearchTerms(searchterm, author, title, place, year);
253-
ResultSet resultSet = elasticsearchService.facetSearch(
254-
searchterms, limit, offset, false, isGT, metadatasearch, fulltextsearch, null,
255-
field, value
256-
);
255+
ResultSet resultSet = null;
256+
if (searchVersion == 1) {
257+
// This code is for the "old" grouped search in case it is needed again
258+
resultSet = elasticsearchService.facetSearch(
259+
searchterms, limit, offset, false, isGT, metadatasearch, fulltextsearch, null, field, value
260+
);
261+
} else {
262+
resultSet = elasticsearchService.facetSearchV2(
263+
searchterms, limit, offset, false, isGT, metadatasearch, fulltextsearch, null, field, value
264+
);
265+
266+
}
257267
return ResponseEntity.ok(resultSet);
258268
}
259269
}
270+
260271
@ApiOperation(value = "Returns the latest PID for an Ocrd-Identifier")
261272
@ApiResponses({ @ApiResponse(code = 200, message = "PID for Ocrd-Identifier found", response = String.class),
262273
@ApiResponse(code = 404, message = "Ocrd-Identifier not found", response = String.class)
Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
package de.ocrd.olahd.elasticsearch;
2+
3+
import static de.ocrd.olahd.Constants.LOGICAL_INDEX_NAME;
4+
5+
import de.ocrd.olahd.domain.EsNumberQuery;
6+
import de.ocrd.olahd.domain.SearchTerms;
7+
import de.ocrd.olahd.utils.Utils;
8+
import java.util.ArrayList;
9+
import java.util.HashMap;
10+
import java.util.HashSet;
11+
import java.util.List;
12+
import java.util.Map;
13+
import java.util.Map.Entry;
14+
import java.util.Set;
15+
import org.apache.commons.lang3.StringUtils;
16+
import org.elasticsearch.action.search.SearchRequest;
17+
import org.elasticsearch.index.query.BoolQueryBuilder;
18+
import org.elasticsearch.index.query.QueryBuilder;
19+
import org.elasticsearch.index.query.QueryBuilders;
20+
import org.elasticsearch.search.aggregations.AggregationBuilders;
21+
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
22+
import org.elasticsearch.search.builder.SearchSourceBuilder;
23+
24+
/**
25+
* Class to create the facet-search-query. Groups together steps to create the Elasticsearch Query to do the facet
26+
* search
27+
*/
28+
public class ElasticQueryHelperV2 {
29+
30+
/** Name of the aggregation containing the search hits */
31+
public static final String HITS_AGG = "group-by-pid";
32+
/** Name of the sub-aggregation containing the pids per facet */
33+
public static final String SUB_AGG_PIDS = "pids-per-facet";
34+
/** Max size of pids-per-facet aggregation */
35+
public static final int MAX_PID_PER_FACET = 100;
36+
public static final String COUNTER_AGG = "counter";
37+
38+
/** Fields which are fetched from source */
39+
private static final String[] SOURCE_FIELDS = new String[] { "pid", "publish_infos", "title", "doctype", "IsGt",
40+
"creator_infos", "structrun", "filegrp_use_types"
41+
};
42+
43+
private final static String F_CREATOR = "Creators";
44+
private final static String F_TITLE = "Titles";
45+
private final static String F_PLACE = "Place";
46+
private final static String F_YEAR = "Publish Year";
47+
private final static String F_FGRP = "File Groups";
48+
49+
/**
50+
* Mapping from filter-name to corresponding column
51+
*
52+
* Filters are named "Creators", "Titles" or "Publishers" etc. This function returns the corresponding column from
53+
* the Elasticsearch-entry. For example for Filter Creator, the column to filter must be creator_infos.name.keyword
54+
*/
55+
public static final Map<String, String> FILTER_MAP = Map.of(
56+
F_CREATOR, "creator_infos.name.keyword",
57+
F_TITLE, "title.title.keyword",
58+
F_PLACE, "publish_infos.place_publish.keyword",
59+
F_YEAR, "publish_infos.year_publish",
60+
F_FGRP, "filegrp_use_types.keyword"
61+
);
62+
63+
private SearchTerms searchterms;
64+
private int limit;
65+
private int offset;
66+
private boolean extended;
67+
private Boolean isGt;
68+
private boolean metadatasearch;
69+
private boolean fulltextsearch;
70+
private String sort;
71+
private String[] field;
72+
private String[] value;
73+
private Set<String> fulltextPids;
74+
75+
public ElasticQueryHelperV2(
76+
SearchTerms searchterm, int limit, int offset, boolean extended, Boolean isGt,
77+
boolean metadatasearch, boolean fulltextsearch, String sort, String[] field, String[] value,
78+
Set<String> fulltextPids
79+
) {
80+
super();
81+
this.searchterms = searchterm;
82+
this.limit = limit;
83+
this.offset = offset;
84+
this.extended = extended;
85+
this.isGt = isGt;
86+
this.metadatasearch = metadatasearch;
87+
this.fulltextsearch = fulltextsearch;
88+
this.sort = sort;
89+
this.field = field;
90+
this.value = value;
91+
this.fulltextPids = fulltextPids;
92+
}
93+
94+
/**
95+
* Create the "searchSource". This is the search-Document elasticsearch executes
96+
*
97+
* The search consists of four parts: - the part of the query responsible for matching the documents
98+
* (query.bool.must) - the part of the query for filtering the results (query.bool.filter) - the aggregation used to
99+
* group the search hits (aggregations.group-by-pid) - the aggregations for collecting the facets
100+
* (aggregations.Titles, aggregations.Creators ...)
101+
*
102+
* @return
103+
*/
104+
public SearchRequest createSearchRequest() {
105+
SearchRequest res = new SearchRequest().indices(LOGICAL_INDEX_NAME);
106+
SearchSourceBuilder source = new SearchSourceBuilder();
107+
res.source(source);
108+
109+
// part 1: matching
110+
BoolQueryBuilder query = this.createQuery();
111+
// part 2: filters
112+
// TODO: according to API do not use filters if 'extended' is specified. could be added here
113+
this.addFacetFilters(query);
114+
// part 4: aggregations for collecting the facets
115+
List<TermsAggregationBuilder> aggFacets = this.createFacetAggregations();
116+
117+
// putting things together
118+
source.query(query);
119+
source.from(offset);
120+
source.size(limit);
121+
122+
for (TermsAggregationBuilder agg : aggFacets) {
123+
source.aggregation(agg);
124+
}
125+
return res;
126+
}
127+
128+
/**
129+
* Add filters corresponding to the facets selected by the user
130+
*
131+
* @param query
132+
*/
133+
private void addFacetFilters(BoolQueryBuilder query) {
134+
if (field != null && field.length > 0) {
135+
Map<String, List<String>> filters = new HashMap<>();
136+
for (int i = 0; i < field.length; i++) {
137+
String fieldName = FILTER_MAP.getOrDefault(field[i], field[i]);
138+
filters.putIfAbsent(fieldName, new ArrayList<>());
139+
filters.get(fieldName).add(value[i]);
140+
}
141+
BoolQueryBuilder boolMust = QueryBuilders.boolQuery();
142+
for (Entry<String, List<String>> entry : filters.entrySet()) {
143+
BoolQueryBuilder boolShould = QueryBuilders.boolQuery();
144+
for (String filterValue : entry.getValue()) {
145+
boolShould.should(QueryBuilders.termQuery(entry.getKey(), filterValue));
146+
}
147+
boolMust.must(boolShould);
148+
}
149+
query.filter(boolMust);
150+
}
151+
}
152+
153+
private BoolQueryBuilder createQuery() {
154+
BoolQueryBuilder res = QueryBuilders.boolQuery();
155+
String searchterm = searchterms.getSearchterm();
156+
res = res.must(QueryBuilders.matchQuery("IsFirst", true));
157+
if (StringUtils.isNotBlank(searchterm)) {
158+
if (metadatasearch && fulltextsearch) {
159+
BoolQueryBuilder boolMust = QueryBuilders.boolQuery();
160+
BoolQueryBuilder boolShould = QueryBuilders.boolQuery();
161+
boolShould.should(addMatchOrQstr("metadata", searchterm));
162+
boolShould.should(
163+
QueryBuilders.termsQuery("pid.keyword", fulltextPids != null ? fulltextPids : new HashSet<>())
164+
);
165+
res = res.must(boolMust.must(boolShould));
166+
} else if (fulltextsearch) {
167+
res = res.must(
168+
QueryBuilders.termsQuery("pid.keyword", fulltextPids != null ? fulltextPids : new HashSet<>())
169+
);
170+
} else {
171+
res = res.must(addMatchOrQstr("metadata", searchterm));
172+
}
173+
}
174+
if (Boolean.TRUE.equals(this.isGt)) {
175+
res = res.must(QueryBuilders.matchQuery("IsGt", true));
176+
}
177+
// This sets filters for the "advanced search"
178+
if (searchterms.hasFilter()) {
179+
if (StringUtils.isNotBlank(searchterms.getAuthor())) {
180+
res = res.filter(
181+
addMatchOrQstr("creator_infos.name", searchterms.getAuthor())
182+
);
183+
}
184+
if (StringUtils.isNotBlank(searchterms.getTitle())) {
185+
res = res.filter(addMatchOrQstr("title.title", searchterms.getTitle()));
186+
}
187+
if (StringUtils.isNotBlank(searchterms.getPlace())) {
188+
res = res.filter(
189+
addMatchOrQstr("publish_infos.place_publish", searchterms.getPlace())
190+
);
191+
}
192+
if (StringUtils.isNotBlank(searchterms.getYear())) {
193+
QueryBuilder numberQuery = createYearQuery(searchterms.getYear());
194+
if (numberQuery != null) {
195+
res = res.filter(numberQuery);
196+
} else {
197+
Utils.logInfo("Search for year cannot be used: '" + searchterms.getYear() + "'");
198+
}
199+
}
200+
}
201+
return res;
202+
}
203+
204+
/**
205+
* Add a match- or a query-string-query
206+
*
207+
* This method checks weather the searchterm contains one or more asterisks and then either creates a match query or
208+
* a query_string_query.
209+
*
210+
* Previously we only had a match query which hits on complete word matches. Later we wanted to add a wildcard
211+
* search. This creates one of it. I wanted to keep the match query for when the asterisk is not used, because
212+
* I think it is faster.
213+
*
214+
* @param string
215+
* @param searchterm
216+
* @return
217+
*/
218+
private QueryBuilder addMatchOrQstr(String fieldname, String searchterm) {
219+
if (searchterm.indexOf('*') > -1) {
220+
return QueryBuilders.queryStringQuery(searchterm).field(fieldname);
221+
} else {
222+
return QueryBuilders.matchQuery(fieldname, searchterm);
223+
}
224+
}
225+
226+
private QueryBuilder createYearQuery(String numberStr) {
227+
if (numberStr.indexOf('*') > -1) {
228+
// asterisk and range cannot work together
229+
if (numberStr.indexOf('>') > -1 || numberStr.indexOf('<') > -1) {
230+
return null;
231+
}
232+
return QueryBuilders.queryStringQuery(numberStr).field("publish_infos.year_publish_string");
233+
}
234+
EsNumberQuery x = EsNumberQuery.fromQueryString(numberStr);
235+
if (x == null) {
236+
return null;
237+
} else if (x.cmp == EsNumberQuery.Cmp.EQ) {
238+
return QueryBuilders.matchQuery("publish_infos.year_publish", numberStr);
239+
}
240+
241+
var rangeQuery = QueryBuilders.rangeQuery("publish_infos.year_publish");
242+
switch (x.cmp) {
243+
case GT:
244+
return rangeQuery.gt(x.value1);
245+
case LT:
246+
return rangeQuery.lt(x.value1);
247+
case GTE:
248+
return rangeQuery.gte(x.value1);
249+
case LTE:
250+
return rangeQuery.lte(x.value1);
251+
case RANGE:
252+
return rangeQuery.gte(x.value1).lte(x.value2);
253+
default:
254+
// This cannot happen, except the enum was extended
255+
throw new AssertionError("Unexpected switch default: createYearQuery");
256+
}
257+
}
258+
259+
private List<TermsAggregationBuilder> createFacetAggregations() {
260+
List<TermsAggregationBuilder> res = new ArrayList<>();
261+
// Facets
262+
res.add(createSingleFacetAggregation(F_TITLE, FILTER_MAP.get(F_TITLE)));
263+
res.add(createSingleFacetAggregation(F_CREATOR, FILTER_MAP.get(F_CREATOR)));
264+
res.add(createSingleFacetAggregation(F_PLACE, FILTER_MAP.get(F_PLACE)));
265+
res.add(createSingleFacetAggregation(F_YEAR, FILTER_MAP.get(F_YEAR)));
266+
res.add(createSingleFacetAggregation(F_FGRP, FILTER_MAP.get(F_FGRP)));
267+
return res;
268+
}
269+
270+
private TermsAggregationBuilder createSingleFacetAggregation(String term, String field) {
271+
return AggregationBuilders.terms(term).field(field);
272+
}
273+
}

src/main/java/de/ocrd/olahd/elasticsearch/ElasticResponseHelper.java

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import java.util.stream.Collectors;
1414
import org.apache.commons.lang3.StringUtils;
1515
import org.elasticsearch.action.search.SearchResponse;
16+
import org.elasticsearch.search.SearchHit;
1617
import org.elasticsearch.search.aggregations.Aggregation;
1718
import org.elasticsearch.search.aggregations.Aggregations;
1819
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
@@ -82,6 +83,56 @@ public Detail fillSearchHitIntoDetail(Map<String, Object> hit) {
8283
return res;
8384
}
8485

86+
/**
87+
* Extract the results from the response and fill it into the response model.
88+
*
89+
* This method is for the simplified search query
90+
*
91+
* @return
92+
*/
93+
public ResultSet responseToResultSetV2(
94+
SearchResponse response, SearchTerms searchterms, boolean metadatasearch,
95+
boolean fulltextsearch, int offset, int limit
96+
) {
97+
98+
ResultSet res = new ResultSet();
99+
List<HitList> hitlist = new ArrayList<>();
100+
res.setHitlist(hitlist);
101+
102+
for (SearchHit hit : response.getHits()) {
103+
Map<String, Object> hitmap = hit.getSourceAsMap();
104+
HitList hitResult = new HitList();
105+
hitlist.add(hitResult);
106+
hitResult.setPid(hitmap.get("pid").toString());
107+
hitResult.setTitle(readTitleFromSearchHit(hitmap));
108+
hitResult.setSubtitle(readSubtitleFromSearchHit(hitmap));
109+
hitResult.setPlaceOfPublish(readPlaceOfPublishFromSearchHit(hitmap));
110+
hitResult.setYearOfPublish(readYearFromSearchHit(hitmap));
111+
hitResult.setPublisher(readPublisherFromSearchHit(hitmap));
112+
hitResult.setCreator(readCreatorFromSearchHit(hitmap));
113+
hitResult.setGt(readIsGtFromSearchHit(hitmap));
114+
}
115+
116+
List<Facets> facets = new ArrayList<>();
117+
for (Aggregation agg : response.getAggregations().asList()) {
118+
Terms terms = (Terms) agg;
119+
List<Values> values = new ArrayList<>();
120+
for (Bucket bucket : terms.getBuckets()) {
121+
Values val = new Values(bucket.getKeyAsString(), (int)bucket.getDocCount(), false);
122+
values.add(val);
123+
}
124+
facets.add(new Facets(terms.getName(), values));
125+
}
126+
res.setFacets(facets);
127+
res.setSearchTerms(searchterms);
128+
res.setMetadataSearch(metadatasearch);
129+
res.setFulltextSearch(fulltextsearch);
130+
res.setOffset(offset);
131+
res.setLimit(limit);
132+
133+
return res;
134+
}
135+
85136
/**
86137
* Convert aggregations with the hits to ResultSet as specified by the API
87138
*

0 commit comments

Comments
 (0)