Skip to content

Commit 7b3bdbe

Browse files
authored
Ensure cross_fields always uses valid term statistics (#90316)
In #89016 we adjusted the `cross_fields` scoring formula to prevent negative scores. This fix accidentally dropped another important fix: we must always take the minimum of the adjusted document frequency (`actualDf`) and the minimum total term frequency (`minTTF`). Otherwise, we can produce invalid term statistics where the total term frequency is less than the document frequency. Fixes #90275
1 parent 83c19ae commit 7b3bdbe

File tree

2 files changed

+68
-3
lines changed

2 files changed

+68
-3
lines changed

server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,15 @@ protected int compare(int i, int j) {
152152
}
153153

154154
int docCount = reader.getDocCount(terms[i].field());
155-
int newDocFreq = Math.min(actualDf, docCount);
155+
156+
// IMPORTANT: we make two adjustments here to ensure the new document frequency is valid:
157+
// 1. We take a minimum with docCount, which is the total number of documents that contain
158+
// this field. The document frequency must always be less than the document count.
159+
// 2. We also take a minimum with maxDoc. Earlier, maxDoc is adjusted to the minimum of
160+
// maxDoc and minTTF. So taking the minimum ensures that the document frequency is never
161+
// greater than the total term frequency, which would be illegal.
162+
int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc);
163+
156164
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq);
157165
prev = current;
158166
sumTTF += ctx.totalTermFreq();

server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,10 @@
4040
import java.io.IOException;
4141
import java.util.Arrays;
4242
import java.util.Collections;
43+
import java.util.HashMap;
4344
import java.util.HashSet;
4445
import java.util.List;
46+
import java.util.Map;
4547
import java.util.Set;
4648

4749
import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -233,16 +235,22 @@ public void testMinTTF() throws IOException {
233235
Document d = new Document();
234236
d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
235237
d.add(new Field("dense", "foo foo foo", ft));
236-
if (i % 10 == 0) {
238+
if (i % 2 == 0) {
237239
d.add(new Field("sparse", "foo", ft));
238240
}
241+
if (i % 10 == 0) {
242+
d.add(new Field("more_sparse", "foo", ft));
243+
}
239244
w.addDocument(d);
240245
}
246+
241247
w.commit();
248+
w.forceMerge(1);
249+
242250
DirectoryReader reader = DirectoryReader.open(w);
243251
IndexSearcher searcher = setSimilarity(newSearcher(reader));
244252
{
245-
String[] fields = new String[] { "dense", "sparse" };
253+
String[] fields = new String[] { "dense", "sparse", "more_sparse" };
246254
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f);
247255
TopDocs search = searcher.search(query, 10);
248256
ScoreDoc[] scoreDocs = search.scoreDocs;
@@ -253,6 +261,55 @@ public void testMinTTF() throws IOException {
253261
dir.close();
254262
}
255263

264+
public void testRandomFields() throws IOException {
265+
Directory dir = newDirectory();
266+
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
267+
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
268+
ft.freeze();
269+
270+
Map<String, Float> fields = new HashMap<>();
271+
fields.put("field", 1.0f);
272+
273+
int numRandomFields = random().nextInt(7);
274+
for (int i = 0; i < numRandomFields; i++) {
275+
String field = "field" + i;
276+
float probability = randomBoolean() ? 1.0f : randomFloat();
277+
fields.put(field, probability);
278+
}
279+
280+
int numDocs = atLeast(100);
281+
for (int i = 0; i < numDocs; i++) {
282+
Document d = new Document();
283+
for (Map.Entry<String, Float> entry : fields.entrySet()) {
284+
String field = entry.getKey();
285+
float probability = entry.getValue();
286+
if (randomFloat() < probability) {
287+
String value = randomBoolean() ? "foo" : "foo foo foo";
288+
d.add(new Field(field, value, ft));
289+
}
290+
if (randomFloat() < probability) {
291+
d.add(new Field(field, "bar bar", ft));
292+
}
293+
}
294+
w.addDocument(d);
295+
}
296+
297+
w.commit();
298+
299+
DirectoryReader reader = DirectoryReader.open(w);
300+
IndexSearcher searcher = setSimilarity(newSearcher(reader));
301+
{
302+
String[] fieldNames = fields.keySet().toArray(new String[0]);
303+
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fieldNames, "foo"), 0.1f);
304+
TopDocs search = searcher.search(query, 10);
305+
assertTrue(search.totalHits.value > 0);
306+
assertTrue(search.scoreDocs.length > 0);
307+
}
308+
reader.close();
309+
w.close();
310+
dir.close();
311+
}
312+
256313
public void testMissingFields() throws IOException {
257314
Directory dir = newDirectory();
258315
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));

0 commit comments

Comments (0)