Skip to content

Commit 3604217

Browse files
OAK-12053 | oak-search-elastic: set max analyzed offset for highlights (#2681)
* OAK-12053: set max analyzed offset for highlights to avoid query failures for large fields * OAK-12053: minor improvements * OAK-12053: minor improvements * OAK-12053: minor improvements * OAK-12053: minor improvements * OAK-12053: add TODO to improve logic when upgrading to v9.x
1 parent 92ac05f commit 3604217

File tree

2 files changed

+39
-5
lines changed

2 files changed

+39
-5
lines changed

oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@
8282

8383
import javax.jcr.PropertyType;
8484
import java.io.IOException;
85-
import java.io.StringReader;
8685
import java.util.ArrayList;
8786
import java.util.HashMap;
8887
import java.util.List;
@@ -131,6 +130,11 @@ public class ElasticRequestHandler {
131130

132131
private static final String HIGHLIGHT_PREFIX = "<strong>";
133132
private static final String HIGHLIGHT_SUFFIX = "</strong>";
133+
// by default, highlight analyzes up to 1M characters. If the content is larger than that, an error is thrown.
134+
// To avoid that we need to set a limit lower than that.
135+
// TODO: when upgrading to 9.x this value can be set to -1 to implicitly set the limit to index.higihlight.max_analyzed_offset
136+
// https://github.com/elastic/elasticsearch/pull/118895
137+
private static final int HIGHLIGHT_MAX_ANALYZED_OFFSET = 999_999;
134138

135139
// Match Lucene 4.x fuzzy queries (e.g., roam~0.8), but not 5.x and beyond (e.g., roam~2)
136140
private static final Pattern LUCENE_4_FUZZY_PATTERN = Pattern.compile("\\b(\\w+)~([0-9]*\\.?[0-9]+)\\b");
@@ -941,13 +945,18 @@ public Query suggestionMatchQuery(String suggestion) {
941945
* @return a Highlight object representing the excerpts to request or null if none should be requested
942946
*/
943947
public Highlight highlight() {
948+
// if the query does not have a full text constraint, the excerpt makes no sense (it will always be empty)
949+
if (indexPlan.getFilter().getFullTextConstraint() == null) {
950+
return null;
951+
}
952+
944953
Map<String, HighlightField> excerpts = indexPlan.getFilter().getPropertyRestrictions().stream()
945954
.filter(pr -> pr.propertyName.startsWith(QueryConstants.REP_EXCERPT))
946955
.map(this::excerptField)
947956
.distinct()
948957
.collect(Collectors.toMap(
949958
Function.identity(),
950-
field -> HighlightField.of(hf -> hf.withJson(new StringReader("{}"))))
959+
field -> HighlightField.of(hf -> hf))
951960
);
952961

953962
if (excerpts.isEmpty()) {
@@ -958,6 +967,7 @@ public Highlight highlight() {
958967
.preTags(HIGHLIGHT_PREFIX)
959968
.postTags(HIGHLIGHT_SUFFIX)
960969
.fields(excerpts)
970+
.maxAnalyzedOffset(HIGHLIGHT_MAX_ANALYZED_OFFSET)
961971
.numberOfFragments(1)
962972
.requireFieldMatch(false));
963973
}

oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/ExcerptTest.java

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package org.apache.jackrabbit.oak.plugins.index;
2020

21+
import org.apache.commons.lang3.RandomStringUtils;
2122
import org.apache.jackrabbit.JcrConstants;
2223
import org.apache.jackrabbit.oak.api.Blob;
2324
import org.apache.jackrabbit.oak.api.PropertyValue;
@@ -29,13 +30,12 @@
2930
import org.apache.jackrabbit.oak.plugins.index.search.IndexFormatVersion;
3031
import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
3132
import org.apache.jackrabbit.oak.query.AbstractQueryTest;
33+
import org.apache.jackrabbit.oak.spi.filter.PathFilter;
3234
import org.junit.Before;
3335
import org.junit.Ignore;
3436
import org.junit.Test;
3537

3638
import java.text.ParseException;
37-
import java.util.ArrayList;
38-
import java.util.Arrays;
3939
import java.util.Iterator;
4040
import java.util.List;
4141
import java.util.stream.Collectors;
@@ -73,6 +73,7 @@ public void setup() throws Exception { //named so that it gets called after supe
7373
def.setProperty(REINDEX_PROPERTY_NAME, true);
7474
def.setProperty(FulltextIndexConstants.EVALUATE_PATH_RESTRICTION, true);
7575
def.setProperty(FulltextIndexConstants.COMPAT_MODE, IndexFormatVersion.V2.getVersion());
76+
def.setProperty(PathFilter.PROP_INCLUDED_PATHS, List.of("/testRoot"), Type.STRINGS);
7677

7778
Tree properties = def.addChild(FulltextIndexConstants.INDEX_RULES)
7879
.addChild("nt:base")
@@ -104,7 +105,7 @@ public void getAllSelectedColumns() throws Exception {
104105
contentRoot.setProperty("baz", "fox ifoxing");
105106
root.commit();
106107

107-
List<String> columns = new ArrayList<>(Arrays.asList("rep:excerpt", "rep:excerpt(.)", "rep:excerpt(foo)", "rep:excerpt(bar)"));
108+
List<String> columns = List.of("rep:excerpt", "rep:excerpt(.)", "rep:excerpt(foo)", "rep:excerpt(bar)");
108109
String selectColumns = columns.stream().map(col -> "[" + col + "]").collect(Collectors.joining(","));
109110
String query = "SELECT " + selectColumns + " FROM [nt:base] WHERE CONTAINS(*, 'fox')";
110111
assertEventually(() -> {
@@ -320,4 +321,27 @@ public void binaryExcerpt() throws Exception {
320321
}
321322
});
322323
}
324+
325+
@Test
326+
public void excerptOnLargeField() throws Exception {
327+
Tree contentRoot = root.getTree("/").addChild("testRoot");
328+
StringBuilder largeContent = new StringBuilder("fox ");
329+
for (int i = 0; i < 1_000_000; i++) {
330+
largeContent.append(RandomStringUtils.insecure().nextAlphabetic(5)).append(" ");
331+
}
332+
largeContent.append(" foxing");
333+
contentRoot.addChild("relative").setProperty("baz", largeContent.toString());
334+
root.commit();
335+
336+
String query = "SELECT [rep:excerpt(.)] FROM [nt:base] WHERE CONTAINS(*, 'fox')";
337+
assertEventually(() -> {
338+
try {
339+
Result result = executeQuery(query, SQL2, NO_BINDINGS);
340+
Iterator<? extends ResultRow> resultIter = result.getRows().iterator();
341+
assertTrue(resultIter.hasNext());
342+
} catch (ParseException e) {
343+
fail(e.getMessage());
344+
}
345+
});
346+
}
323347
}

0 commit comments

Comments
 (0)