Skip to content

Commit 7b3bdbe

Browse files
authored
Ensure cross_fields always uses valid term statistics (#90316)
In #89016 we adjusted the `cross_fields` scoring formula to prevent negative scores. This fix accidentally dropped another important fix: we must always take the minimum of the adjusted document frequency (`actualDf`) and the minimum total term frequency (`minTTF`). Otherwise, we can produce invalid term statistics where the total term frequency is less than the document frequency. Fixes #90275
1 parent 83c19ae commit 7b3bdbe

File tree

2 files changed

+68
-3
lines changed

2 files changed

+68
-3
lines changed

server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,15 @@ protected int compare(int i, int j) {
152152
}
153153

154154
int docCount = reader.getDocCount(terms[i].field());
155-
int newDocFreq = Math.min(actualDf, docCount);
155+
156+
// IMPORTANT: we make two adjustments here to ensure the new document frequency is valid:
157+
// 1. We take a minimum with docCount, which is the total number of documents that contain
158+
// this field. The document frequency must always be less than the document count.
159+
// 2. We also take a minimum with maxDoc. Earlier, maxDoc is adjusted to the minimum of
160+
// maxDoc and minTTF. So taking the minimum ensures that the document frequency is never
161+
// greater than the total term frequency, which would be illegal.
162+
int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc);
163+
156164
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq);
157165
prev = current;
158166
sumTTF += ctx.totalTermFreq();

server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,10 @@
4040
import java.io.IOException;
4141
import java.util.Arrays;
4242
import java.util.Collections;
43+
import java.util.HashMap;
4344
import java.util.HashSet;
4445
import java.util.List;
46+
import java.util.Map;
4547
import java.util.Set;
4648

4749
import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -233,16 +235,22 @@ public void testMinTTF() throws IOException {
233235
Document d = new Document();
234236
d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
235237
d.add(new Field("dense", "foo foo foo", ft));
236-
if (i % 10 == 0) {
238+
if (i % 2 == 0) {
237239
d.add(new Field("sparse", "foo", ft));
238240
}
241+
if (i % 10 == 0) {
242+
d.add(new Field("more_sparse", "foo", ft));
243+
}
239244
w.addDocument(d);
240245
}
246+
241247
w.commit();
248+
w.forceMerge(1);
249+
242250
DirectoryReader reader = DirectoryReader.open(w);
243251
IndexSearcher searcher = setSimilarity(newSearcher(reader));
244252
{
245-
String[] fields = new String[] { "dense", "sparse" };
253+
String[] fields = new String[] { "dense", "sparse", "more_sparse" };
246254
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f);
247255
TopDocs search = searcher.search(query, 10);
248256
ScoreDoc[] scoreDocs = search.scoreDocs;
@@ -253,6 +261,55 @@ public void testMinTTF() throws IOException {
253261
dir.close();
254262
}
255263

264+
public void testRandomFields() throws IOException {
265+
Directory dir = newDirectory();
266+
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
267+
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
268+
ft.freeze();
269+
270+
Map<String, Float> fields = new HashMap<>();
271+
fields.put("field", 1.0f);
272+
273+
int numRandomFields = random().nextInt(7);
274+
for (int i = 0; i < numRandomFields; i++) {
275+
String field = "field" + i;
276+
float probability = randomBoolean() ? 1.0f : randomFloat();
277+
fields.put(field, probability);
278+
}
279+
280+
int numDocs = atLeast(100);
281+
for (int i = 0; i < numDocs; i++) {
282+
Document d = new Document();
283+
for (Map.Entry<String, Float> entry : fields.entrySet()) {
284+
String field = entry.getKey();
285+
float probability = entry.getValue();
286+
if (randomFloat() < probability) {
287+
String value = randomBoolean() ? "foo" : "foo foo foo";
288+
d.add(new Field(field, value, ft));
289+
}
290+
if (randomFloat() < probability) {
291+
d.add(new Field(field, "bar bar", ft));
292+
}
293+
}
294+
w.addDocument(d);
295+
}
296+
297+
w.commit();
298+
299+
DirectoryReader reader = DirectoryReader.open(w);
300+
IndexSearcher searcher = setSimilarity(newSearcher(reader));
301+
{
302+
String[] fieldNames = fields.keySet().toArray(new String[0]);
303+
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fieldNames, "foo"), 0.1f);
304+
TopDocs search = searcher.search(query, 10);
305+
assertTrue(search.totalHits.value > 0);
306+
assertTrue(search.scoreDocs.length > 0);
307+
}
308+
reader.close();
309+
w.close();
310+
dir.close();
311+
}
312+
256313
public void testMissingFields() throws IOException {
257314
Directory dir = newDirectory();
258315
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));

0 commit comments

Comments (0)