Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/python/txtai/embeddings/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .base import Search
from .errors import *
from .explain import Explain
from .hybrid import Hybrid
from .ids import Ids
from .query import Query
from .scan import Scan
Expand Down
25 changes: 4 additions & 21 deletions src/python/txtai/embeddings/search/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging

from .errors import IndexNotFoundError
from .hybrid import Hybrid
from .scan import Scan

# Logging configuration
Expand Down Expand Up @@ -113,27 +114,9 @@ def search(self, queries, limit, weights, index):
if isinstance(weights, (int, float)):
weights = [weights, 1 - weights]

# Create weighted scores
results = []
for vectors in zip(dense, sparse):
uids = {}
for v, scores in enumerate(vectors):
for r, (uid, score) in enumerate(scores if weights[v] > 0 else []):
# Initialize score
if uid not in uids:
uids[uid] = 0.0

# Create hybrid score
# - Convex Combination when sparse scores are normalized
# - Reciprocal Rank Fusion (RRF) when sparse scores aren't normalized
if self.scoring.isnormalized():
uids[uid] += score * weights[v]
else:
uids[uid] += (1.0 / (r + 1)) * weights[v]

results.append(sorted(uids.items(), key=lambda x: x[1], reverse=True)[:limit])

return results
# Create weighted scores via hybrid fusion strategy
fusion = Hybrid(self.scoring)
return [fusion(vectors, weights, limit) for vectors in zip(dense, sparse)]

# Raise an error when no indexes are available
if not sparse and not dense:
Expand Down
199 changes: 199 additions & 0 deletions src/python/txtai/embeddings/search/hybrid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
"""
Hybrid module
"""

import math


# Numerical clamp for log-odds computation
_EPSILON = 1e-10


class Hybrid:
"""
Hybrid score fusion strategies for combining dense and sparse search results.

Selects a fusion method based on the sparse scoring configuration:
- Log-odds conjunction for Bayesian (BB25) normalized scores
- Convex combination for default normalized scores
- Reciprocal Rank Fusion (RRF) for unnormalized scores
"""

def __init__(self, scoring):
"""
Creates a new Hybrid instance.

Args:
scoring: sparse scoring instance
"""

if scoring.isbayes():
self.method = self.logodds
elif scoring.isnormalized():
self.method = self.convex
else:
self.method = self.rrf

def __call__(self, vectors, weights, limit):
"""
Fuses dense and sparse result vectors into a single ranked list.

Args:
vectors: tuple of (dense_results, sparse_results)
weights: [dense_weight, sparse_weight]
limit: maximum results

Returns:
sorted list of (uid, score)
"""

return self.method(vectors, weights, limit)

def calibrate(self, dense_raw):
"""
Computes per-query calibration parameters for dense cosine scores.

Uses the same approach as BB25: beta=median, alpha_eff=1/std so the
logit for a dense score is alpha * (score - median), centering the
median candidate at logit 0.

Args:
dense_raw: list of raw dense cosine scores

Returns:
(median, alpha) calibration parameters
"""

d_median, d_alpha = 0.0, 1.0

dense_arr = [s for s in dense_raw if s > 0]
if dense_arr:
d_median = sorted(dense_arr)[len(dense_arr) // 2]
d_std = (sum((x - sum(dense_arr) / len(dense_arr)) ** 2 for x in dense_arr) / len(dense_arr)) ** 0.5
d_alpha = 1.0 / d_std if d_std > 0 else 1.0

return d_median, d_alpha

def logodds(self, vectors, weights, limit):
"""
Log-odds conjunction fusion for Bayesian (BB25) normalized scores.

Implements the framework from "From Bayesian Inference to Neural Computation"
(Jeong, 2026) with asymmetric dynamic calibration:

1. Calibrate dense cosine scores via per-query dynamic sigmoid
(beta=median, alpha_eff=1/std) to produce logits centered at 0.
2. Convert sparse BB25 probabilities to logits.
3. Fuse via weighted mean log-odds with confidence scaling.

Scores are returned as raw logits (not mapped back through sigmoid) to
preserve ranking resolution among top candidates.

Args:
vectors: tuple of (dense_results, sparse_results)
weights: [dense_weight, sparse_weight]
limit: maximum results

Returns:
sorted list of (uid, score) where score is a fused logit
"""

# Phase 1: Collect raw scores per document
uids = {}
dense_raw = []
for v, scores in enumerate(vectors):
for uid, score in scores if weights[v] > 0 else []:
if uid not in uids:
uids[uid] = [None, None]

if v == 0:
uids[uid][0] = score
dense_raw.append(score)
else:
# Sparse BB25 score: already a calibrated probability
uids[uid][1] = score

# Phase 2: Compute per-query calibration parameters for dense cosine scores.
# Same approach as BB25: beta=median, alpha_eff=1/std. The logit for a dense
# score is alpha * (score - median), centering the median candidate at logit 0.
d_median, d_alpha = self.calibrate(dense_raw)

# Phase 3: Fuse via weighted mean log-odds with confidence scaling.
# Raw logit scores are used for ranking instead of sigmoid(logit) to
# preserve fine-grained ordering among top candidates.
fused = {}
n = 2
alpha = 0.5
scale = n**alpha

for uid, pair in uids.items():
raw_dense = pair[0]
p_sparse = pair[1]

if raw_dense is not None and p_sparse is not None:
# Calibrate dense score via dynamic sigmoid
logit_d = d_alpha * (raw_dense - d_median)
logit_d = max(min(logit_d, 500), -500)

# Sparse BB25 score -> logit
p_s = min(max(p_sparse, _EPSILON), 1.0 - _EPSILON)
logit_s = math.log(p_s / (1.0 - p_s))

# Weighted mean log-odds with confidence scaling (Paper 2, Def 4.2.1)
l_bar = weights[0] * logit_d + weights[1] * logit_s
fused[uid] = l_bar * scale
elif raw_dense is not None:
# Only dense signal: calibrated logit scaled by weight
logit_d = d_alpha * (raw_dense - d_median)
logit_d = max(min(logit_d, 500), -500)
fused[uid] = logit_d * weights[0]
else:
# Only sparse signal: logit scaled by weight
p_s = min(max(p_sparse, _EPSILON), 1.0 - _EPSILON)
fused[uid] = math.log(p_s / (1.0 - p_s)) * weights[1]

return sorted(fused.items(), key=lambda x: x[1], reverse=True)[:limit]

def convex(self, vectors, weights, limit):
"""
Convex combination fusion for default normalized scores.

Args:
vectors: tuple of (dense_results, sparse_results)
weights: [dense_weight, sparse_weight]
limit: maximum results

Returns:
sorted list of (uid, score)
"""

uids = {}
for v, scores in enumerate(vectors):
for uid, score in scores if weights[v] > 0 else []:
if uid not in uids:
uids[uid] = 0.0
uids[uid] += score * weights[v]

return sorted(uids.items(), key=lambda x: x[1], reverse=True)[:limit]

def rrf(self, vectors, weights, limit):
"""
Reciprocal Rank Fusion for unnormalized scores.

Args:
vectors: tuple of (dense_results, sparse_results)
weights: [dense_weight, sparse_weight]
limit: maximum results

Returns:
sorted list of (uid, score)
"""

uids = {}
for v, scores in enumerate(vectors):
for r, (uid, _) in enumerate(scores if weights[v] > 0 else []):
if uid not in uids:
uids[uid] = 0.0
uids[uid] += (1.0 / (r + 1)) * weights[v]

return sorted(uids.items(), key=lambda x: x[1], reverse=True)[:limit]
10 changes: 10 additions & 0 deletions src/python/txtai/scoring/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,13 @@ def isnormalized(self):
"""

raise NotImplementedError

def isbayes(self):
    """
    Indicates whether Bayesian (BB25) normalization is in effect.

    Returns:
        True when BB25/Bayesian normalization is active, False otherwise
    """

    # Default for the base scoring contract; subclasses with BB25 support override this
    return False
17 changes: 14 additions & 3 deletions src/python/txtai/scoring/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,19 @@ def __init__(self, config):
self.beta = self.config.get("beta")
self.beta = float(self.beta) if self.beta is not None else self.beta

# BB25-compatible aliases for Bayesian normalization mode.
BAYESIAN_METHODS = ("bayes", "bayesian", "bayesian-bm25", "bb25")

def isbayes(self):
    """
    Determines whether the configured normalization method is a Bayesian alias.

    Returns:
        True if using BB25/Bayesian normalization
    """

    # Compare against each recognized BB25 alias
    return any(self.method == name for name in self.BAYESIAN_METHODS)

def __call__(self, scores, avgscore):
"""
Normalizes scores.
Expand All @@ -45,9 +58,7 @@ def __call__(self, scores, avgscore):
normalized scores
"""

# BB25-compatible aliases for Bayesian normalization mode.
bayesian = ("bayes", "bayesian", "bayesian-bm25", "bb25")
return self.bayes(scores) if self.method in bayesian else self.default(scores, avgscore)
return self.bayes(scores) if self.isbayes() else self.default(scores, avgscore)

def default(self, scores, avgscore):
"""
Expand Down
3 changes: 3 additions & 0 deletions src/python/txtai/scoring/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,9 @@ def issparse(self):
def isnormalized(self):
    """
    Checks if this scoring instance normalizes scores.

    Returns:
        the normalize setting (truthy when score normalization is enabled)
    """

    # NOTE(review): presumably a boolean config flag set at construction - confirm
    return self.normalize

def isbayes(self):
    """
    Checks whether the underlying normalizer applies Bayesian (BB25) normalization.

    Returns:
        True if a normalizer is present and reports Bayesian mode, False otherwise
    """

    normalizer = self.normalizer
    if normalizer is None:
        # No normalizer configured - Bayesian mode impossible
        return False

    return normalizer.isbayes()

def computefreq(self, tokens):
"""
Computes token frequency. Used for token weighting.
Expand Down