Skip to content

Commit ba0d1ea

Browse files
authored
Add BB25 normalization for sparse encoders (#1046)
* Add log-odds conjunction fusion for BB25 hybrid search BB25 normalization outputs calibrated probabilities, but the existing hybrid fusion uses convex combination which discards the Bayesian probability semantics. This causes BB25 to regress on 4/5 BEIR datasets. Add log-odds conjunction fusion (from "From Bayesian Inference to Neural Computation") that correctly combines probability signals in logit space with per-query dynamic calibration for dense cosine scores. - scoring/normalize.py: Extract Bayesian method check into isbayes() - scoring/base.py: Add default isbayes() returning False - scoring/tfidf.py: Add isbayes() delegating to normalizer - search/base.py: Add logodds(), convex(), rrf() fusion methods; dispatch based on isbayes() BEIR nDCG@10 results (BB25+LogOdds vs Default): arguana +2.23, fiqa +2.03, scidocs +0.62, scifact +1.33, nfcorpus -1.96 * Extract Hybrid class for score fusion strategies Move logodds, convex, and rrf fusion methods from Search into a dedicated Hybrid class, following the same pattern as Normalize. * Fix coding convention issues in Hybrid class for CI - Fix black formatting: remove unnecessary parentheses, remove spaces around ** - Fix pylint too-many-branches: extract calibrate() method from logodds() - Fix pylint unused-variable: rename score to _ in rrf() * Add BB25 normalization for sparse encoders and fix IVFSparse topn bug - Support `normalize: bb25` config for sparse encoder scoring, enabling Bayesian sigmoid calibration as an alternative to default linear normalization. Reuses existing Normalize.bayes() infrastructure. - Fix dimension check in IVFSparse.topn(): use scores.shape[1] (number of data items) instead of scores.shape[0] (number of queries) for the argpartition kth bound check. The previous code caused ValueError when the number of centroids was less than nprobe. * Add tests for BB25 sparse normalization and IVFSparse topn fix
1 parent 8929992 commit ba0d1ea

File tree

4 files changed

+117
-5
lines changed

4 files changed

+117
-5
lines changed

src/python/txtai/ann/sparse/ivfsparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ def topn(self, queries, data, limit, deletes=None):
269269
scores[:, deletes] = 0
270270

271271
# Get top n matching indices and scores
272-
indices = np.argpartition(-scores, limit if limit < scores.shape[0] else scores.shape[0] - 1)[:, :limit]
272+
indices = np.argpartition(-scores, limit if limit < scores.shape[1] else scores.shape[1] - 1)[:, :limit]
273273
scores = np.take_along_axis(scores, indices, axis=1)
274274

275275
return indices, scores

src/python/txtai/scoring/sparse.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from ..vectors import SparseVectorsFactory
1010

1111
from .base import Scoring
12+
from .normalize import Normalize
1213

1314

1415
class Sparse(Scoring):
@@ -33,9 +34,17 @@ def __init__(self, config=None, models=None):
3334
self.model = SparseVectorsFactory.create(config, models)
3435

3536
# Normalize search outputs if vectors are not normalized already
36-
# A float can also be provided to set the normalization factor (defaults to 30.0)
37+
# Supports: True (default linear, scale=30.0), float (custom scale),
38+
# "bb25"/"bayes" (Bayesian sigmoid calibration), False (disabled)
3739
self.isnormalize = self.config.get("normalize", True)
3840

41+
# Create Bayesian normalizer when a Bayesian method is configured
42+
self.normalizer = None
43+
if isinstance(self.isnormalize, (str, dict)):
44+
normalizer = Normalize(self.isnormalize)
45+
if normalizer.isbayes():
46+
self.normalizer = normalizer
47+
3948
# Sparse ANN
4049
self.ann = None
4150

@@ -131,7 +140,7 @@ def isnormalized(self):
131140
return self.isnormalize or self.model.isnormalize
132141

133142
def isbayes(self):
134-
return False
143+
return self.normalizer is not None
135144

136145
def start(self, checkpoint):
137146
"""
@@ -194,7 +203,12 @@ def stream(self):
194203

195204
def normalize(self, queries, scores):
196205
"""
197-
Normalize query result using the max query score.
206+
Normalize query results.
207+
208+
When Bayesian normalization is configured, applies sigmoid calibration
209+
with per-query adaptive parameters (beta=median, alpha=1/std) to produce
210+
calibrated probabilities in [0, 1]. Otherwise, applies linear normalization
211+
using the max query score.
198212
199213
Args:
200214
queries: query vectors
@@ -204,7 +218,11 @@ def normalize(self, queries, scores):
204218
normalized query results
205219
"""
206220

207-
# Get normalize scale factor
221+
# Bayesian sigmoid calibration
222+
if self.normalizer:
223+
return [self.normalizer.bayes(result) if result else [] for result in scores]
224+
225+
# Default linear normalization
208226
scale = 30.0 if isinstance(self.isnormalize, bool) else self.isnormalize
209227

210228
# Normalize scores using max scores

test/python/testann/testsparse.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,29 @@ def testIVFSparse(self):
8282
self.assertLess(len(ann.blocks), 15)
8383
ann.close()
8484

85+
def testIVFSparseTopnOverLimit(self):
86+
"""
87+
Test IVFSparse topn when limit exceeds the number of indexed documents
88+
"""
89+
90+
# Generate a small dataset (5 documents)
91+
data = self.generate(5, 30522)
92+
93+
ann = SparseANNFactory.create({"backend": "ivfsparse"})
94+
ann.index(data)
95+
96+
# Search with limit (10) greater than document count (5)
97+
results = ann.search(data[0], 10)
98+
self.assertGreater(len(results[0]), 0)
99+
100+
# Batch search with multiple queries exceeding document count
101+
results = ann.search(data, 10)
102+
self.assertEqual(len(results), data.shape[0])
103+
for result in results:
104+
self.assertGreater(len(result), 0)
105+
106+
ann.close()
107+
85108
@patch("sqlalchemy.orm.Query.limit")
86109
def testPGSparse(self, query):
87110
"""

test/python/testscoring/testsparse.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,77 @@ def testGPU(self, count):
102102
self.assertIsNotNone(scoring)
103103
scoring.close()
104104

105+
def testBayes(self):
106+
"""
107+
Test BB25 Bayesian normalization for sparse scoring
108+
"""
109+
110+
config = {
111+
"method": "sparse",
112+
"path": "sparse-encoder-testing/splade-bert-tiny-nq",
113+
"normalize": "bb25",
114+
}
115+
scoring = ScoringFactory.create(config)
116+
scoring.index((uid, {"text": text}, tags) for uid, text, tags in self.data)
117+
118+
# Verify Bayesian mode flags
119+
self.assertTrue(scoring.isbayes())
120+
self.assertTrue(scoring.isnormalized())
121+
122+
# Search and validate scores are calibrated probabilities in [0, 1]
123+
results = scoring.search("lottery ticket", 3)
124+
self.assertGreater(len(results), 0)
125+
for _, score in results:
126+
self.assertGreaterEqual(score, 0.0)
127+
self.assertLessEqual(score, 1.0)
128+
129+
# Batch search
130+
results = scoring.batchsearch(["lottery ticket", "ice shelf"], 3)
131+
self.assertEqual(len(results), 2)
132+
for query_results in results:
133+
for _, score in query_results:
134+
self.assertGreaterEqual(score, 0.0)
135+
self.assertLessEqual(score, 1.0)
136+
137+
scoring.close()
138+
139+
def testBayesDict(self):
140+
"""
141+
Test BB25 normalization with dict config
142+
"""
143+
144+
config = {
145+
"method": "sparse",
146+
"path": "sparse-encoder-testing/splade-bert-tiny-nq",
147+
"normalize": {"method": "bb25", "alpha": 2.0},
148+
}
149+
scoring = ScoringFactory.create(config)
150+
scoring.index((uid, {"text": text}, tags) for uid, text, tags in self.data)
151+
152+
self.assertTrue(scoring.isbayes())
153+
154+
results = scoring.search("lottery ticket", 3)
155+
self.assertGreater(len(results), 0)
156+
for _, score in results:
157+
self.assertGreaterEqual(score, 0.0)
158+
self.assertLessEqual(score, 1.0)
159+
160+
scoring.close()
161+
162+
def testBayesNonBayes(self):
163+
"""
164+
Test that non-Bayesian string normalize values do not activate Bayesian mode
165+
"""
166+
167+
config = {
168+
"method": "sparse",
169+
"path": "sparse-encoder-testing/splade-bert-tiny-nq",
170+
"normalize": "default",
171+
}
172+
scoring = ScoringFactory.create(config)
173+
self.assertFalse(scoring.isbayes())
174+
scoring.close()
175+
105176
def testIVFFlat(self):
106177
"""
107178
Test sparse vectors with IVFFlat clustering

0 commit comments

Comments (0)