Skip to content

Commit b959674

Browse files
committed
Highlighter: avoid merging zero-score fragments
MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used Add regression tests and CHANGES entries
1 parent 78dd964 commit b959674

File tree

5 files changed

+162
-0
lines changed

5 files changed

+162
-0
lines changed

commitmsg.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Highlighter: avoid merging zero-score fragments
2+
MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used
3+
Add regression tests and CHANGES entries

lucene/CHANGES.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ Bug Fixes
9898

9999
* GITHUB#15125: Handle inconsistent schema on flush with index sorts (Nhat Nguyen)
100100

101+
* GITHUB#15333: Highlighter: prevent zero-scored fragments from being merged with
102+
adjacent fragments. This avoids producing merged passages that include content with
103+
no matches. A regression test was added (TestZeroScoreMerging).
104+
101105
Changes in Runtime Behavior
102106
---------------------
103107
* GITHUB#14187: The query cache is now disabled by default. (Adrien Grand)
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.search;
18+
19+
import org.apache.lucene.document.Document;
20+
import org.apache.lucene.document.Field;
21+
import org.apache.lucene.document.StringField;
22+
import org.apache.lucene.index.DirectoryReader;
23+
import org.apache.lucene.index.IndexWriter;
24+
import org.apache.lucene.index.IndexWriterConfig;
25+
import org.apache.lucene.index.Term;
26+
import org.apache.lucene.store.Directory;
27+
import org.apache.lucene.tests.util.LuceneTestCase;
28+
29+
/**
30+
* Regression test for a bug where MaxScoreBulkScorer could score past leaf maxDoc when a
31+
* restrictive filter and disjunction were used together.
32+
*/
33+
public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase {
34+
35+
public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception {
36+
Directory dir = newDirectory();
37+
IndexWriterConfig iwc = new IndexWriterConfig();
38+
try (IndexWriter w = new IndexWriter(dir, iwc)) {
39+
// Create a small index where one clause matches more docs than the other, and a restrictive
40+
// filter
41+
for (int i = 0; i < 200; i++) {
42+
Document d = new Document();
43+
// Clause A matches ~1/3
44+
d.add(new StringField("a", (i % 3 == 0) ? "yes" : "no", Field.Store.NO));
45+
// Clause B matches ~1/9
46+
d.add(new StringField("b", (i % 9 == 0) ? "yes" : "no", Field.Store.NO));
47+
// Restrictive filter matches ~1%
48+
d.add(new StringField("f", (i % 100 == 0) ? "on" : "off", Field.Store.NO));
49+
w.addDocument(d);
50+
}
51+
}
52+
53+
try (DirectoryReader reader = DirectoryReader.open(dir)) {
54+
IndexSearcher searcher = new IndexSearcher(reader);
55+
56+
Query disjunction =
57+
new BooleanQuery.Builder()
58+
.add(new TermQuery(new Term("a", "yes")), BooleanClause.Occur.SHOULD)
59+
.add(new TermQuery(new Term("b", "yes")), BooleanClause.Occur.SHOULD)
60+
.build();
61+
62+
Query filter = new TermQuery(new Term("f", "on"));
63+
64+
Query filtered =
65+
new BooleanQuery.Builder()
66+
.add(disjunction, BooleanClause.Occur.SHOULD)
67+
.add(filter, BooleanClause.Occur.FILTER)
68+
.build();
69+
70+
// This triggers TOP_SCORES path internally; just execute to ensure no exceptions
71+
TopDocs td = searcher.search(filtered, 10);
72+
assertNotNull(td);
73+
} finally {
74+
dir.close();
75+
}
76+
}
77+
}
78+

lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,19 @@ public final TextFragment[] getBestTextFragments(
294294
// return the most relevant fragments
295295
TextFragment[] frag = fragQueue.drainToArrayHighestFirst(TextFragment[]::new);
296296

297+
// Filter out zero-scored fragments BEFORE merging so that non-matching
298+
// text cannot be merged into matching fragments and "inherit" a positive score.
299+
// This prevents large zero-content blocks from polluting the highlights.
300+
if (frag.length > 0) {
301+
ArrayList<TextFragment> positive = new ArrayList<>(frag.length);
302+
for (TextFragment f : frag) {
303+
if (f != null && f.getScore() > 0.0f) {
304+
positive.add(f);
305+
}
306+
}
307+
frag = positive.toArray(new TextFragment[0]);
308+
}
309+
297310
// merge any contiguous fragments to improve readability
298311
if (mergeContiguousFragments) {
299312
mergeContiguousFragments(frag);
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.search.highlight;
18+
19+
import org.apache.lucene.analysis.TokenStream;
20+
import org.apache.lucene.index.Term;
21+
import org.apache.lucene.search.TermQuery;
22+
import org.apache.lucene.tests.analysis.MockAnalyzer;
23+
import org.apache.lucene.tests.analysis.MockTokenizer;
24+
import org.apache.lucene.tests.util.LuceneTestCase;
25+
26+
/**
27+
* Regression test for merging zero-scored fragments into scored fragments (#15333).
28+
*/
29+
public class TestZeroScoreMerging extends LuceneTestCase {
30+
31+
public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception {
32+
// Build a text with large zero-matching regions around a single matching token "credit".
33+
String prefix = repeat('a', 130); // ensures at least one 100-char fragment with score 0
34+
String match = " credit ";
35+
String suffix = repeat('b', 130);
36+
String text = prefix + match + suffix;
37+
38+
MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
39+
40+
TermQuery query = new TermQuery(new Term("f", "credit"));
41+
QueryScorer scorer = new QueryScorer(query, "f");
42+
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
43+
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
44+
45+
TokenStream ts = analyzer.tokenStream("f", text);
46+
String[] best = highlighter.getBestFragments(ts, text, 3);
47+
48+
// We expect only the fragment containing the match to be returned, not merged with neighbors
49+
assertEquals("Only the scored fragment should be returned", 1, best.length);
50+
assertTrue(
51+
"Returned fragment must contain the highlighted match",
52+
best[0].contains("<B>credit</B>") || best[0].contains("<b>credit</b>"));
53+
// And it should not be overly long (i.e., not a merge of 3x100-size fragments)
54+
assertTrue("Fragment should be near the configured size", best[0].length() <= 160);
55+
}
56+
57+
private static String repeat(char c, int count) {
58+
StringBuilder sb = new StringBuilder(count);
59+
for (int i = 0; i < count; i++) {
60+
sb.append(c);
61+
}
62+
return sb.toString();
63+
}
64+
}

0 commit comments

Comments
 (0)