Skip to content

Commit ba0d1ea

Browse files
authored
Add BB25 normalization for sparse encoders (#1046)
* Add log-odds conjunction fusion for BB25 hybrid search BB25 normalization outputs calibrated probabilities, but the existing hybrid fusion uses convex combination which discards the Bayesian probability semantics. This causes BB25 to regress on 4/5 BEIR datasets. Add log-odds conjunction fusion (from "From Bayesian Inference to Neural Computation") that correctly combines probability signals in logit space with per-query dynamic calibration for dense cosine scores. - scoring/normalize.py: Extract Bayesian method check into isbayes() - scoring/base.py: Add default isbayes() returning False - scoring/tfidf.py: Add isbayes() delegating to normalizer - search/base.py: Add logodds(), convex(), rrf() fusion methods; dispatch based on isbayes() BEIR nDCG@10 results (BB25+LogOdds vs Default): arguana +2.23, fiqa +2.03, scidocs +0.62, scifact +1.33, nfcorpus -1.96 * Extract Hybrid class for score fusion strategies Move logodds, convex, and rrf fusion methods from Search into a dedicated Hybrid class, following the same pattern as Normalize. * Fix coding convention issues in Hybrid class for CI - Fix black formatting: remove unnecessary parentheses, remove spaces around ** - Fix pylint too-many-branches: extract calibrate() method from logodds() - Fix pylint unused-variable: rename score to _ in rrf() * Add BB25 normalization for sparse encoders and fix IVFSparse topn bug - Support `normalize: bb25` config for sparse encoder scoring, enabling Bayesian sigmoid calibration as an alternative to default linear normalization. Reuses existing Normalize.bayes() infrastructure. - Fix dimension check in IVFSparse.topn(): use scores.shape[1] (number of data items) instead of scores.shape[0] (number of queries) for the argpartition kth bound check. The previous code caused ValueError when the number of centroids was less than nprobe. * Add tests for BB25 sparse normalization and IVFSparse topn fix
1 parent 8929992 commit ba0d1ea

File tree

4 files changed

+117
-5
lines changed

4 files changed

+117
-5
lines changed

src/python/txtai/ann/sparse/ivfsparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ def topn(self, queries, data, limit, deletes=None):
269269
scores[:, deletes] = 0
270270

271271
# Get top n matching indices and scores
272-
indices = np.argpartition(-scores, limit if limit < scores.shape[0] else scores.shape[0] - 1)[:, :limit]
272+
indices = np.argpartition(-scores, limit if limit < scores.shape[1] else scores.shape[1] - 1)[:, :limit]
273273
scores = np.take_along_axis(scores, indices, axis=1)
274274

275275
return indices, scores

src/python/txtai/scoring/sparse.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from ..vectors import SparseVectorsFactory
1010

1111
from .base import Scoring
12+
from .normalize import Normalize
1213

1314

1415
class Sparse(Scoring):
@@ -33,9 +34,17 @@ def __init__(self, config=None, models=None):
3334
self.model = SparseVectorsFactory.create(config, models)
3435

3536
# Normalize search outputs if vectors are not normalized already
36-
# A float can also be provided to set the normalization factor (defaults to 30.0)
37+
# Supports: True (default linear, scale=30.0), float (custom scale),
38+
# "bb25"/"bayes" (Bayesian sigmoid calibration), False (disabled)
3739
self.isnormalize = self.config.get("normalize", True)
3840

41+
# Create Bayesian normalizer when a Bayesian method is configured
42+
self.normalizer = None
43+
if isinstance(self.isnormalize, (str, dict)):
44+
normalizer = Normalize(self.isnormalize)
45+
if normalizer.isbayes():
46+
self.normalizer = normalizer
47+
3948
# Sparse ANN
4049
self.ann = None
4150

@@ -131,7 +140,7 @@ def isnormalized(self):
131140
return self.isnormalize or self.model.isnormalize
132141

133142
def isbayes(self):
134-
return False
143+
return self.normalizer is not None
135144

136145
def start(self, checkpoint):
137146
"""
@@ -194,7 +203,12 @@ def stream(self):
194203

195204
def normalize(self, queries, scores):
196205
"""
197-
Normalize query result using the max query score.
206+
Normalize query results.
207+
208+
When Bayesian normalization is configured, applies sigmoid calibration
209+
with per-query adaptive parameters (beta=median, alpha=1/std) to produce
210+
calibrated probabilities in [0, 1]. Otherwise, applies linear normalization
211+
using the max query score.
198212
199213
Args:
200214
queries: query vectors
@@ -204,7 +218,11 @@ def normalize(self, queries, scores):
204218
normalized query results
205219
"""
206220

207-
# Get normalize scale factor
221+
# Bayesian sigmoid calibration
222+
if self.normalizer:
223+
return [self.normalizer.bayes(result) if result else [] for result in scores]
224+
225+
# Default linear normalization
208226
scale = 30.0 if isinstance(self.isnormalize, bool) else self.isnormalize
209227

210228
# Normalize scores using max scores

test/python/testann/testsparse.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,29 @@ def testIVFSparse(self):
8282
self.assertLess(len(ann.blocks), 15)
8383
ann.close()
8484

85+
def testIVFSparseTopnOverLimit(self):
86+
"""
87+
Test IVFSparse topn when limit exceeds the number of indexed documents
88+
"""
89+
90+
# Generate a small dataset (5 documents)
91+
data = self.generate(5, 30522)
92+
93+
ann = SparseANNFactory.create({"backend": "ivfsparse"})
94+
ann.index(data)
95+
96+
# Search with limit (10) greater than document count (5)
97+
results = ann.search(data[0], 10)
98+
self.assertGreater(len(results[0]), 0)
99+
100+
# Batch search with multiple queries exceeding document count
101+
results = ann.search(data, 10)
102+
self.assertEqual(len(results), data.shape[0])
103+
for result in results:
104+
self.assertGreater(len(result), 0)
105+
106+
ann.close()
107+
85108
@patch("sqlalchemy.orm.Query.limit")
86109
def testPGSparse(self, query):
87110
"""

test/python/testscoring/testsparse.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,77 @@ def testGPU(self, count):
102102
self.assertIsNotNone(scoring)
103103
scoring.close()
104104

105+
def testBayes(self):
106+
"""
107+
Test BB25 Bayesian normalization for sparse scoring
108+
"""
109+
110+
config = {
111+
"method": "sparse",
112+
"path": "sparse-encoder-testing/splade-bert-tiny-nq",
113+
"normalize": "bb25",
114+
}
115+
scoring = ScoringFactory.create(config)
116+
scoring.index((uid, {"text": text}, tags) for uid, text, tags in self.data)
117+
118+
# Verify Bayesian mode flags
119+
self.assertTrue(scoring.isbayes())
120+
self.assertTrue(scoring.isnormalized())
121+
122+
# Search and validate scores are calibrated probabilities in [0, 1]
123+
results = scoring.search("lottery ticket", 3)
124+
self.assertGreater(len(results), 0)
125+
for _, score in results:
126+
self.assertGreaterEqual(score, 0.0)
127+
self.assertLessEqual(score, 1.0)
128+
129+
# Batch search
130+
results = scoring.batchsearch(["lottery ticket", "ice shelf"], 3)
131+
self.assertEqual(len(results), 2)
132+
for query_results in results:
133+
for _, score in query_results:
134+
self.assertGreaterEqual(score, 0.0)
135+
self.assertLessEqual(score, 1.0)
136+
137+
scoring.close()
138+
139+
def testBayesDict(self):
140+
"""
141+
Test BB25 normalization with dict config
142+
"""
143+
144+
config = {
145+
"method": "sparse",
146+
"path": "sparse-encoder-testing/splade-bert-tiny-nq",
147+
"normalize": {"method": "bb25", "alpha": 2.0},
148+
}
149+
scoring = ScoringFactory.create(config)
150+
scoring.index((uid, {"text": text}, tags) for uid, text, tags in self.data)
151+
152+
self.assertTrue(scoring.isbayes())
153+
154+
results = scoring.search("lottery ticket", 3)
155+
self.assertGreater(len(results), 0)
156+
for _, score in results:
157+
self.assertGreaterEqual(score, 0.0)
158+
self.assertLessEqual(score, 1.0)
159+
160+
scoring.close()
161+
162+
def testBayesNonBayes(self):
163+
"""
164+
Test that non-Bayesian string normalize values do not activate Bayesian mode
165+
"""
166+
167+
config = {
168+
"method": "sparse",
169+
"path": "sparse-encoder-testing/splade-bert-tiny-nq",
170+
"normalize": "default",
171+
}
172+
scoring = ScoringFactory.create(config)
173+
self.assertFalse(scoring.isbayes())
174+
scoring.close()
175+
105176
def testIVFFlat(self):
106177
"""
107178
Test sparse vectors with IVFFlat clustering

0 commit comments

Comments (0)