Highlighter: avoid merging zero-score fragments

kdt523 · kdt523 · commit b9596749f4c2 · 2025-10-22T00:34:11.000+05:30
MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used
Add regression tests and CHANGES entries
diff --git a/commitmsg.txt b/commitmsg.txt
@@ -0,0 +1,3 @@
+Highlighter: avoid merging zero-score fragments 
+MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used 
+Add regression tests and CHANGES entries
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -98,6 +98,10 @@ Bug Fixes
 
 * GITHUB#15125: Handle inconsistent schema on flush with index sorts (Nhat Nguyen)
 
+* GITHUB#15333: Highlighter: prevent zero-scored fragments from being merged with
+  adjacent fragments. This avoids producing merged passages that include content with
+  no matches. A regression test was added (TestZeroScoreMerging).
+
 Changes in Runtime Behavior
 ---------------------
 * GITHUB#14187: The query cache is now disabled by default. (Adrien Grand)
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+/**
+ * Regression test for a bug where MaxScoreBulkScorer could score past leaf maxDoc when a
+ * restrictive filter and disjunction were used together.
+ */
+public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase {
+
+  public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception {
+  Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    try (IndexWriter w = new IndexWriter(dir, iwc)) {
+      // Create a small index where one clause matches more docs than the other, and a restrictive
+      // filter
+      for (int i = 0; i < 200; i++) {
+        Document d = new Document();
+        // Clause A matches ~1/3
+        d.add(new StringField("a", (i % 3 == 0) ? "yes" : "no", Field.Store.NO));
+        // Clause B matches ~1/9
+        d.add(new StringField("b", (i % 9 == 0) ? "yes" : "no", Field.Store.NO));
+        // Restrictive filter matches ~1%
+        d.add(new StringField("f", (i % 100 == 0) ? "on" : "off", Field.Store.NO));
+        w.addDocument(d);
+      }
+    }
+
+    try (DirectoryReader reader = DirectoryReader.open(dir)) {
+      IndexSearcher searcher = new IndexSearcher(reader);
+
+      Query disjunction =
+          new BooleanQuery.Builder()
+              .add(new TermQuery(new Term("a", "yes")), BooleanClause.Occur.SHOULD)
+              .add(new TermQuery(new Term("b", "yes")), BooleanClause.Occur.SHOULD)
+              .build();
+
+      Query filter = new TermQuery(new Term("f", "on"));
+
+      Query filtered =
+          new BooleanQuery.Builder()
+              .add(disjunction, BooleanClause.Occur.SHOULD)
+              .add(filter, BooleanClause.Occur.FILTER)
+              .build();
+
+      // This triggers TOP_SCORES path internally; just execute to ensure no exceptions
+      TopDocs td = searcher.search(filtered, 10);
+      assertNotNull(td);
+    } finally {
+      dir.close();
+    }
+  }
+}
+
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@@ -294,6 +294,19 @@ public final TextFragment[] getBestTextFragments(
       // return the most relevant fragments
       TextFragment[] frag = fragQueue.drainToArrayHighestFirst(TextFragment[]::new);
 
+      // Filter out zero-scored fragments BEFORE merging so that non-matching
+      // text cannot be merged into matching fragments and "inherit" a positive score.
+      // This prevents large zero-content blocks from polluting the highlights.
+      if (frag.length > 0) {
+        ArrayList<TextFragment> positive = new ArrayList<>(frag.length);
+        for (TextFragment f : frag) {
+          if (f != null && f.getScore() > 0.0f) {
+            positive.add(f);
+          }
+        }
+        frag = positive.toArray(new TextFragment[0]);
+      }
+
       // merge any contiguous fragments to improve readability
       if (mergeContiguousFragments) {
         mergeContiguousFragments(frag);
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestZeroScoreMerging.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.highlight;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.tests.analysis.MockAnalyzer;
+import org.apache.lucene.tests.analysis.MockTokenizer;
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+/**
+ * Regression test for merging zero-scored fragments into scored fragments (#15333).
+ */
+public class TestZeroScoreMerging extends LuceneTestCase {
+
+  public void testZeroScoredFragmentsAreNotMergedIntoHighlights() throws Exception {
+    // Build a text with large zero-matching regions around a single matching token "credit".
+    String prefix = repeat('a', 130); // ensures at least one 100-char fragment with score 0
+    String match = " credit ";
+    String suffix = repeat('b', 130);
+    String text = prefix + match + suffix;
+
+    MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+
+    TermQuery query = new TermQuery(new Term("f", "credit"));
+    QueryScorer scorer = new QueryScorer(query, "f");
+    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
+    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
+
+    TokenStream ts = analyzer.tokenStream("f", text);
+    String[] best = highlighter.getBestFragments(ts, text, 3);
+
+    // We expect only the fragment containing the match to be returned, not merged with neighbors
+    assertEquals("Only the scored fragment should be returned", 1, best.length);
+    assertTrue(
+        "Returned fragment must contain the highlighted match",
+        best[0].contains("<B>credit</B>") || best[0].contains("<b>credit</b>"));
+    // And it should not be overly long (i.e., not a merge of 3x100-size fragments)
+    assertTrue("Fragment should be near the configured size", best[0].length() <= 160);
+  }
+
+  private static String repeat(char c, int count) {
+    StringBuilder sb = new StringBuilder(count);
+    for (int i = 0; i < count; i++) {
+      sb.append(c);
+    }
+    return sb.toString();
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Highlighter: avoid merging zero-score fragments`
	`2`	`+MaxScoreBulkScorer: clamp candidate advancement to leaf bounds when filtered disjunctions are used`
	`3`	`+Add regression tests and CHANGES entries`