diff --git a/commitmsg.txt b/commitmsg.txt new file mode 100644 index 000000000000..d3df0f0be19f --- /dev/null +++ b/commitmsg.txt @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +Highlighter: avoid merging zero-score fragments +MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used +Add regression tests and CHANGES entries diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 119636b4b040..3673a09281f7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -98,6 +98,10 @@ Bug Fixes * GITHUB#15125: Handle inconsistent schema on flush with index sorts (Nhat Nguyen) +* GITHUB#15333: Highlighter: prevent zero-scored fragments from being merged with + adjacent fragments. This avoids producing merged passages that include content with + no matches. A regression test was added (TestZeroScoreMerging). + Changes in Runtime Behavior --------------------- * GITHUB#14187: The query cache is now disabled by default. (Adrien Grand) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java new file mode 100644 index 000000000000..04c512c57d11 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.LuceneTestCase; + +/** + * Regression test for a bug where MaxScoreBulkScorer could score past leaf maxDoc when a + * restrictive filter and disjunction were used together. + */ +public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase { + + public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + try (IndexWriter w = new IndexWriter(dir, iwc)) { + // Create a small index where one clause matches more docs than the other, and a restrictive + // filter + for (int i = 0; i < 200; i++) { + Document d = new Document(); + // Clause A matches ~1/3 + d.add(new StringField("a", (i % 3 == 0) ? "yes" : "no", Field.Store.NO)); + // Clause B matches ~1/9 + d.add(new StringField("b", (i % 9 == 0) ? "yes" : "no", Field.Store.NO)); + // Restrictive filter matches ~1% + d.add(new StringField("f", (i % 100 == 0) ? "on" : "off", Field.Store.NO)); + w.addDocument(d); + } + } + + try (DirectoryReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = new IndexSearcher(reader); + + Query disjunction = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("a", "yes")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("b", "yes")), BooleanClause.Occur.SHOULD) + .build(); + + Query filter = new TermQuery(new Term("f", "on")); + + Query filtered = + new BooleanQuery.Builder() + .add(disjunction, BooleanClause.Occur.SHOULD) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + + // This triggers TOP_SCORES path internally; just execute to ensure no exceptions + TopDocs td = searcher.search(filtered, 10); + assertNotNull(td); + } finally { + dir.close(); + } + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java index ad2873dde8af..4e5374d6ace6 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java @@ -294,6 +294,19 @@ public final TextFragment[] getBestTextFragments( // return the most relevant fragments TextFragment[] frag = fragQueue.drainToArrayHighestFirst(TextFragment[]::new); + // Filter out zero-scored fragments BEFORE merging so that non-matching + // text cannot be merged into matching fragments and "inherit" a positive score. + // This prevents large zero-content blocks from polluting the highlights. + if (frag.length > 0) { + ArrayList positive = new ArrayList<>(frag.length); + for (TextFragment f : frag) { + if (f != null && f.getScore() > 0.0f) { + positive.add(f); + } + } + frag = positive.toArray(new TextFragment[0]); + } + // merge any contiguous fragments to improve readability if (mergeContiguousFragments) { mergeContiguousFragments(frag); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java new file mode 100644 index 000000000000..c537972e2007 --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.highlight; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.util.LuceneTestCase; + +/** Regression test for merging zero-scored fragments into scored fragments (#15333). */ +public class TestZeroScoreMerging extends LuceneTestCase { + + public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception { + // Build a text with large zero-matching regions around a single matching token "credit". + String prefix = repeat('a', 130); // ensures at least one 100-char fragment with score 0 + String match = " credit "; + String suffix = repeat('b', 130); + String text = prefix + match + suffix; + + MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + + TermQuery query = new TermQuery(new Term("f", "credit")); + QueryScorer scorer = new QueryScorer(query, "f"); + Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer); + highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100)); + + TokenStream ts = analyzer.tokenStream("f", text); + String[] best = highlighter.getBestFragments(ts, text, 3); + + // We expect only the fragment containing the match to be returned, not merged with neighbors + assertEquals("Only the scored fragment should be returned", 1, best.length); + assertTrue( + "Returned fragment must contain the highlighted match", + best[0].contains("credit") || best[0].contains("credit")); + // And it should not be overly long (i.e., not a merge of 3x100-size fragments) + assertTrue("Fragment should be near the configured size", best[0].length() <= 160); + } + + private static String repeat(char c, int count) { + StringBuilder sb = new StringBuilder(count); + for (int i = 0; i < count; i++) { + sb.append(c); + } + return sb.toString(); + } +}