|
23 | 23 | import java.util.stream.DoubleStream; |
24 | 24 |
|
25 | 25 | /** |
26 | | - * 基于 RRF 算法的后处理器。 |
| 26 | + * RRF (Reciprocal Rank Fusion) Algorithm Post-Processor. |
| 27 | + * <p> |
| 28 | + * A lightweight rank fusion algorithm for merging multiple sorted lists (e.g., search results, recommendation lists). |
| 29 | + * Core idea: Generates a unified ranking by calculating the reciprocal weighted sum of document ranks across lists. Key |
| 30 | + * features: |
| 31 | + * </p> |
| 32 | + * <ul> |
| 33 | + * <li>No score normalization required</li> |
| 34 | + * <li>Strong robustness to anomalous rankings</li> |
| 35 | + * <li>Computationally efficient</li> |
| 36 | + * </ul> |
27 | 37 | * |
| 38 | + * <p><b>Core Formula:</b></p> |
| 39 | + * RRF_Score(d) = Σ [ 1 / (k + rankᵢ(d)) ] |
| 40 | + * <ul> |
| 41 | + * <li><b>d</b>: Document to be ranked</li> |
| 42 | + * <li><b>rankᵢ(d)</b>: Rank of document d in the i-th list (counting starts from 1)</li> |
| 43 | + * <li><b>k</b>: Smoothing constant (default 60), adjusts low-rank document contribution</li> |
| 44 | + * </ul> |
| 45 | + * |
| 46 | + * <p><b>Algorithm Workflow:</b></p> |
| 47 | + * <ol> |
| 48 | + * <li>Initialize hash table: Stores document ID → cumulative RRF score</li> |
| 49 | + * <li>Traverse each sorted list: |
| 50 | + * <pre>{@code |
| 51 | + * for (List<Document> list : allLists) { |
| 52 | + * for (int rank = 1; rank <= list.size(); rank++) { |
| 53 | + * double score = 1.0 / (k + rank); |
| 54 | + * map.put(docId, map.getOrDefault(docId, 0.0) + score); |
| 55 | + * } |
| 56 | + * } |
| 57 | + * }</pre> |
| 58 | + * </li> |
| 59 | + * <li>Sort by total score descending to generate final ranking</li> |
| 60 | + * </ol> |
| 61 | + * |
| 62 | + * <p><b>Parameter Functionality:</b></p> |
| 63 | + * <ul> |
| 64 | + * <li><b>Smaller k</b> → Amplifies top-ranked documents (e.g., k=1: rank1=0.5, rank2≈0.33)</li> |
| 65 | + * <li><b>Larger k</b> → Increases low-rank influence (e.g., k=100: rank1≈0.01, rank100=0.005)</li> |
| 66 | + * <li><b>Default k=60</b>: Empirical value balancing rank influence</li> |
| 67 | + * </ul> |
| 68 | + * |
| 69 | + * <p><b>Applicable Scenarios:</b></p> |
| 70 | + * <ul> |
| 71 | + * <li>✅ <b>Hybrid Search</b>: Fusing keyword retrieval (BM25) and vector search results</li> |
| 72 | + * <li>✅ <b>RAG Systems</b>: Merging outputs from multiple retrievers (e.g., BM25/Dense Retrieval)</li> |
| 73 | + * <li>✅ <b>Multi-strategy Recommendations</b>: Combining collaborative/content-based filtering lists</li> |
| 74 | + * </ul> |
| 75 | + * |
| 76 | + * <p><b>Important Notes:</b></p> |
| 77 | + * <ul> |
| 78 | + * <li>Result quality depends on input ranking quality (low-quality inputs amplify bias)</li> |
| 79 | + * <li>Documents missing from a list contribute no score for that list</li> |
| 80 | + * <li>Final scores are only for relative ordering (not comparable across queries)</li> |
| 81 | + * <li>Java implementation considerations: |
| 82 | + * <ul> |
| 83 | + * <li>Use {@link Map}{@code <}{@link String}{@code , }{@link Double}{@code >} for document scores</li> |
| 84 | + * <li>Sort with {@code PriorityQueue} or {@code Stream.sorted()}</li> |
| 85 | + * </ul> |
| 86 | + * </li> |
| 87 | + * </ul> |
| 88 | + * <p> |
| 89 | + * ------------------------------------------------------------------------------------------ |
| 90 | + * <p> |
| 91 | + * 基于 RRF(Reciprocal Rank Fusion) 算法的后处理器。 |
| 92 | + * <p> |
| 93 | + * 轻量级排名融合算法,用于合并多个排序列表(如搜索结果、推荐列表)。核心思想是对文档在不同列表中的排名取倒数加权求和,生成统一排序。特点包括: |
| 94 | + * </p> |
| 95 | + * <ul> |
| 96 | + * <li>无需分数归一化</li> |
| 97 | + * <li>对异常排名鲁棒性强</li> |
| 98 | + * <li>计算高效</li> |
| 99 | + * </ul> |
| 100 | + * |
| 101 | + * <p><b>核心公式:</b></p> |
| 102 | + * RRF_Score(d) = Σ [ 1 / (k + rankᵢ(d)) ] |
| 103 | + * <ul> |
| 104 | + * <li><b>d</b>: 表示待排序文档</li> |
| 105 | + * <li><b>rankᵢ(d)</b>: 表示文档 d 在第 i 个排序列表中的排名(从 1 开始计数)</li> |
| 106 | + * <li><b>k</b>: 表示平滑常数(默认 60),用于调节低排名文档的贡献度</li> |
| 107 | + * </ul> |
| 108 | + * |
| 109 | + * <p><b>算法流程:</b></p> |
| 110 | + * <ol> |
| 111 | + * <li>初始化哈希表:存储文档 ID → 累积 RRF 分数</li> |
| 112 | + * <li>遍历每个排序列表: |
| 113 | + * <pre>{@code |
| 114 | + * for (List<Document> list : allLists) { |
| 115 | + * for (int rank = 1; rank <= list.size(); rank++) { |
| 116 | + * double score = 1.0 / (k + rank); |
| 117 | + * map.put(docId, map.getOrDefault(docId, 0.0) + score); |
| 118 | + * } |
| 119 | + * } |
| 120 | + * }</pre> |
| 121 | + * </li> |
| 122 | + * <li>按总分降序排序生成最终融合排名</li> |
| 123 | + * </ol> |
| 124 | + * |
| 125 | + * <p><b>参数作用:</b></p> |
| 126 | + * <ul> |
| 127 | + * <li><b>k 值越小</b> → 高排名文档优势放大(如 k = 1 时,第 1 名得分 0.5,第 2 名 ≈ 0.33)</li> |
| 128 | + * <li><b>k 值越大</b> → 低排名文档影响力提升(如 k = 100 时,第 1 名得分 ≈ 0.01,第 100 名 0.005)</li> |
| 129 | + * <li><b>默认 k = 60</b>:经验值,平衡高/低排名影响力</li> |
| 130 | + * </ul> |
| 131 | + * |
| 132 | + * <p><b>适用场景:</b></p> |
| 133 | + * <ul> |
| 134 | + * <li>✅ <b>混合搜索</b>:融合关键词检索(BM25)与向量搜索结果</li> |
| 135 | + * <li>✅ <b>RAG 系统</b>:合并多检索器(如 BM25 / Dense Retrieval)的输出</li> |
| 136 | + * <li>✅ <b>多策略推荐</b>:融合协同过滤、内容过滤的推荐列表</li> |
| 137 | + * </ul> |
| 138 | + * |
| 139 | + * <p><b>注意事项:</b></p> |
| 140 | + * <ul> |
| 141 | + * <li>输入列表的排名质量直接影响结果(低质量输入会放大偏差)</li> |
| 142 | + * <li>若文档未出现在某列表中,则忽略该列表贡献</li> |
| 143 | + * <li>最终分数仅用于排序,无绝对语义(不可跨查询比较)</li> |
| 144 | + * <li>Java 实现时需注意: |
| 145 | + * <ul> |
| 146 | + * <li>使用 {@link Map}{@code <}{@link String}{@code , }{@link Double}{@code >} 存储文档得分</li> |
| 147 | + * <li>排序推荐使用 {@code PriorityQueue} 或 {@code Stream.sorted()}</li> |
| 148 | + * </ul> |
| 149 | + * </li> |
| 150 | + * </ul> |
| 151 | + * |
| 152 | + * @see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html">Elasticsearch RRF |
| 153 | + * Documentation</a> |
| 154 | + * @see <a href="https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf">Cormack et al. (2009) "Reciprocal Rank |
| 155 | + * Fusion Outperforms Condorcet and Individual Rank Learning Methods"</a> |
28 | 156 | * @since 2024-09-29 |
29 | 157 | */ |
30 | 158 | public class RrfPostProcessor implements DocumentPostProcessor { |
31 | 159 | private static final int DEFAULT_FACTOR = 60; |
32 | 160 |
|
33 | | - private static final Map<RrfScoreStrategyEnum, Function<DoubleStream, OptionalDouble>> SCORE_STRATEGY_MAP = |
34 | | - MapBuilder.<RrfScoreStrategyEnum, Function<DoubleStream, OptionalDouble>>get() |
35 | | - .put(RrfScoreStrategyEnum.MAX, DoubleStream::max) |
36 | | - .put(RrfScoreStrategyEnum.AVG, DoubleStream::average) |
| 161 | + private static final Map<RrfScoreStrategy, Function<DoubleStream, OptionalDouble>> SCORE_STRATEGY_MAP = |
| 162 | + MapBuilder.<RrfScoreStrategy, Function<DoubleStream, OptionalDouble>>get() |
| 163 | + .put(RrfScoreStrategy.MAX, DoubleStream::max) |
| 164 | + .put(RrfScoreStrategy.AVG, DoubleStream::average) |
37 | 165 | .build(); |
38 | 166 |
|
39 | | - private final RrfScoreStrategyEnum scoreStrategy; |
| 167 | + private final RrfScoreStrategy scoreStrategy; |
40 | 168 | private final int factor; |
41 | 169 |
|
| 170 | + /** |
| 171 | + * Creates a default RRF post-processor that uses the {@code MAX} score strategy and the default factor of 60. |
| 172 | + */ |
42 | 173 | public RrfPostProcessor() { |
43 | | - this(RrfScoreStrategyEnum.MAX, DEFAULT_FACTOR); |
| 174 | + this(RrfScoreStrategy.MAX, DEFAULT_FACTOR); |
44 | 175 | } |
45 | 176 |
|
46 | | - public RrfPostProcessor(RrfScoreStrategyEnum scoreStrategy) { |
| 177 | + /** |
| 178 | + * Creates an RRF post-processor with the specified score strategy and the default factor. |
| 179 | + * |
| 180 | + * @param scoreStrategy The score strategy to use, as a {@link RrfScoreStrategy}. |
| 181 | + */ |
| 182 | + public RrfPostProcessor(RrfScoreStrategy scoreStrategy) { |
47 | 183 | this(scoreStrategy, DEFAULT_FACTOR); |
48 | 184 | } |
49 | 185 |
|
50 | | - public RrfPostProcessor(RrfScoreStrategyEnum scoreStrategy, int factor) { |
| 186 | + /** |
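| 187 | + * Creates an RRF post-processor with the specified score strategy and factor. |
| 188 | + * |
| 189 | + * @param scoreStrategy The score strategy to use, as a {@link RrfScoreStrategy}; must not be null. |
| 190 | + * @param factor The factor used by the RRF computation, as an {@code int}; must be non-negative. |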
| 191 | + */ |
| 192 | + public RrfPostProcessor(RrfScoreStrategy scoreStrategy, int factor) { |
51 | 193 | this.scoreStrategy = Validation.notNull(scoreStrategy, "The score strategy cannot be null."); |
52 | 194 | this.factor = Validation.greaterThanOrEquals(factor, 0, "The factor must be non-negative."); |
53 | 195 | if (!SCORE_STRATEGY_MAP.containsKey(this.scoreStrategy)) { |
|
0 commit comments