diff --git a/docs/changelog/128504.yaml b/docs/changelog/128504.yaml new file mode 100644 index 0000000000000..7b451d27520aa --- /dev/null +++ b/docs/changelog/128504.yaml @@ -0,0 +1,5 @@ +pr: 128504 +summary: Add l2_norm normalization support to linear retriever +area: Relevance +type: enhancement +issues: [] diff --git a/docs/reference/search/search-your-data/retrievers-overview.asciidoc b/docs/reference/search/search-your-data/retrievers-overview.asciidoc index 1a94ae18a5c20..b0f0a98793805 100644 --- a/docs/reference/search/search-your-data/retrievers-overview.asciidoc +++ b/docs/reference/search/search-your-data/retrievers-overview.asciidoc @@ -26,6 +26,40 @@ Returns top documents from a <>, in the context of a * <>. Combines the top results from multiple sub-retrievers using a weighted sum of their scores. Allows to specify different weights for each retriever, as well as independently normalize the scores from each result set. + + [discrete] + [[retrievers-overview-linear-retriever-parameters]] +==== Linear Retriever Parameters + +`retrievers` +: (Required, array of objects) + A list of sub-retriever configurations that we will take into account and whose result sets we will merge through a weighted sum. Each configuration can have a different weight and normalization depending on the specified retriever. + +Each entry specifies the following parameters: + +`retriever` +: (Required, a `retriever` object) + Specifies the retriever for which we will compute the top documents. The retriever will produce `rank_window_size` results, which will later be merged based on the specified `weight` and `normalizer`. + +`weight` +: (Optional, float) + The weight that each score of this retriever’s top docs will be multiplied by. Must be greater than or equal to 0. Defaults to 1.0. + +`normalizer` +: (Optional, String) + Specifies how we will normalize the retriever’s scores, before applying the specified `weight`. 
Available values are: `minmax`, `l2_norm`, and `none`. Defaults to `none`. + + * `none` + * `minmax` : A `MinMaxScoreNormalizer` that normalizes scores based on the following formula + + ``` + score = (score - min) / (max - min) + ``` + + * `l2_norm` : An `L2ScoreNormalizer` that normalizes scores using the L2 norm of the score values. + +See also the hybrid search example for how to independently configure and apply normalizers to retrievers. + * <>. Combines and ranks multiple first-stage retrievers using the reciprocal rank fusion (RRF) algorithm. Allows you to combine multiple result sets with different relevance indicators into a single result set. diff --git a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/RankRRFFeatures.java b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/RankRRFFeatures.java index ef52dcd33ca78..7952a7f8d7da1 100644 --- a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/RankRRFFeatures.java +++ b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/RankRRFFeatures.java @@ -14,6 +14,7 @@ import java.util.Set; import static org.elasticsearch.search.retriever.CompoundRetrieverBuilder.INNER_RETRIEVERS_FILTER_SUPPORT; +import static org.elasticsearch.xpack.rank.linear.L2ScoreNormalizer.LINEAR_RETRIEVER_L2_NORM; import static org.elasticsearch.xpack.rank.linear.MinMaxScoreNormalizer.LINEAR_RETRIEVER_MINMAX_SINGLE_DOC_FIX; import static org.elasticsearch.xpack.rank.rrf.RRFRetrieverBuilder.RRF_RETRIEVER_COMPOSITION_SUPPORTED; @@ -31,6 +32,6 @@ public Set getFeatures() { @Override public Set getTestFeatures() { - return Set.of(INNER_RETRIEVERS_FILTER_SUPPORT, LINEAR_RETRIEVER_MINMAX_SINGLE_DOC_FIX); + return Set.of(INNER_RETRIEVERS_FILTER_SUPPORT, LINEAR_RETRIEVER_MINMAX_SINGLE_DOC_FIX, LINEAR_RETRIEVER_L2_NORM); } } diff --git a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/L2ScoreNormalizer.java 
b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/L2ScoreNormalizer.java new file mode 100644 index 0000000000000..34c0a48f15a0d --- /dev/null +++ b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/L2ScoreNormalizer.java @@ -0,0 +1,74 @@ + +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.rank.linear; + +import org.apache.lucene.search.ScoreDoc; +import org.elasticsearch.features.NodeFeature; + +/** + * A score normalizer that applies L2 normalization to a set of scores. + *

+ * Each score is divided by the L2 norm of the non-NaN scores if the norm is at least a small EPSILON. + * If the norm is below EPSILON (e.g. all scores are zero or NaN), the original scores are returned unchanged. + *

+ */ +public class L2ScoreNormalizer extends ScoreNormalizer { + + public static final L2ScoreNormalizer INSTANCE = new L2ScoreNormalizer(); + + public static final String NAME = "l2_norm"; + + private static final float EPSILON = 1e-6f; + + public static final NodeFeature LINEAR_RETRIEVER_L2_NORM = new NodeFeature("linear_retriever.l2_norm"); + + public L2ScoreNormalizer() {} + + @Override + public String getName() { + return NAME; + } + + /** + * Divides every score by the L2 norm (square root of the sum of squares) of the non-NaN scores. + * + * @param docs the scored documents to normalize; the array and its elements are not modified + * @return a new array of rescaled documents, or {@code docs} itself when it is empty, has no + * non-NaN score, or has a norm below {@code EPSILON} + */ + @Override + public ScoreDoc[] normalizeScores(ScoreDoc[] docs) { + if (docs.length == 0) { + return docs; + } + double sumOfSquares = 0.0; + boolean atLeastOneValidScore = false; + for (ScoreDoc doc : docs) { + if (Float.isNaN(doc.score) == false) { + atLeastOneValidScore = true; + // Widen to double before squaring: a float product can overflow to Infinity and loses precision. + sumOfSquares += (double) doc.score * doc.score; + } + } + if (atLeastOneValidScore == false) { + // No valid scores to normalize + return docs; + } + double norm = Math.sqrt(sumOfSquares); + if (norm < EPSILON) { + return docs; + } + ScoreDoc[] scoreDocs = new ScoreDoc[docs.length]; + for (int i = 0; i < docs.length; i++) { + float score = (float) (docs[i].score / norm); + scoreDocs[i] = new ScoreDoc(docs[i].doc, score, docs[i].shardIndex); + } + return scoreDocs; + } +} diff --git a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/ScoreNormalizer.java b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/ScoreNormalizer.java index 48334b9adf957..34b7277498218 100644 --- a/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/ScoreNormalizer.java +++ b/x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/ScoreNormalizer.java @@ -17,6 +17,9 @@ public abstract class ScoreNormalizer { public static ScoreNormalizer valueOf(String normalizer) { if (MinMaxScoreNormalizer.NAME.equalsIgnoreCase(normalizer)) { return MinMaxScoreNormalizer.INSTANCE; + } else if (L2ScoreNormalizer.NAME.equalsIgnoreCase(normalizer)) { + return L2ScoreNormalizer.INSTANCE; + + } else if 
(IdentityScoreNormalizer.NAME.equalsIgnoreCase(normalizer)) { return IdentityScoreNormalizer.INSTANCE; diff --git a/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/linear/L2ScoreNormalizerTests.java b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/linear/L2ScoreNormalizerTests.java new file mode 100644 index 0000000000000..ad8906ac89ae1 --- /dev/null +++ b/x-pack/plugin/rank-rrf/src/test/java/org/elasticsearch/xpack/rank/linear/L2ScoreNormalizerTests.java @@ -0,0 +1,54 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.rank.linear; + +import org.apache.lucene.search.ScoreDoc; +import org.elasticsearch.test.ESTestCase; + +/** Unit tests for {@link L2ScoreNormalizer#normalizeScores}. */ public class L2ScoreNormalizerTests extends ESTestCase { + + /** Scores (3, 4) have L2 norm 5, so the expected results are 3/5 and 4/5. */ public void testNormalizeTypicalVector() { + ScoreDoc[] docs = { new ScoreDoc(1, 3.0f, 0), new ScoreDoc(2, 4.0f, 0) }; + ScoreDoc[] normalized = L2ScoreNormalizer.INSTANCE.normalizeScores(docs); + assertEquals(0.6f, normalized[0].score, 1e-5); + assertEquals(0.8f, normalized[1].score, 1e-5); + } + + /** All-zero input is expected to come back with zero scores. */ public void testAllZeros() { + ScoreDoc[] docs = { new ScoreDoc(1, 0.0f, 0), new ScoreDoc(2, 0.0f, 0) }; + ScoreDoc[] normalized = L2ScoreNormalizer.INSTANCE.normalizeScores(docs); + assertEquals(0.0f, normalized[0].score, 0.0f); + assertEquals(0.0f, normalized[1].score, 0.0f); + } + + /** All-NaN input is expected to keep its NaN scores. */ public void testAllNaN() { + ScoreDoc[] docs = { new ScoreDoc(1, Float.NaN, 0), new ScoreDoc(2, Float.NaN, 0) }; + ScoreDoc[] normalized = L2ScoreNormalizer.INSTANCE.normalizeScores(docs); + assertTrue(Float.isNaN(normalized[0].score)); + assertTrue(Float.isNaN(normalized[1].score)); + } + + /** A zero score is expected to stay zero and a NaN score to stay NaN. */ public void testMixedZeroAndNaN() { + ScoreDoc[] docs = { new ScoreDoc(1, 0.0f, 0), new ScoreDoc(2, Float.NaN, 0) }; + ScoreDoc[] normalized = 
L2ScoreNormalizer.INSTANCE.normalizeScores(docs); + assertEquals(0.0f, normalized[0].score, 0.0f); + assertTrue(Float.isNaN(normalized[1].score)); + } + + /** A single non-zero score equals its own L2 norm, so it is expected to normalize to 1. */ public void testSingleElement() { + ScoreDoc[] docs = { new ScoreDoc(1, 42.0f, 0) }; + ScoreDoc[] normalized = L2ScoreNormalizer.INSTANCE.normalizeScores(docs); + assertEquals(1.0f, normalized[0].score, 1e-5); + } + + /** An empty input is expected to yield an empty result. */ public void testEmptyArray() { + ScoreDoc[] docs = {}; + ScoreDoc[] normalized = L2ScoreNormalizer.INSTANCE.normalizeScores(docs); + assertEquals(0, normalized.length); + } +} diff --git a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/linear/10_linear_retriever.yml b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/linear/10_linear_retriever.yml index 52ab532462e44..a6f8c580aa32d 100644 --- a/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/linear/10_linear_retriever.yml +++ b/x-pack/plugin/rank-rrf/src/yamlRestTest/resources/rest-api-spec/test/linear/10_linear_retriever.yml @@ -265,6 +265,99 @@ setup: - match: { hits.hits.3._id: "3" } - close_to: { hits.hits.3._score: { value: 0.0, error: 0.001 } } +--- +"should normalize initial scores with l2_norm": + - requires: + cluster_features: [ "linear_retriever.l2_norm" ] + reason: "Requires l2_norm normalization support in linear retriever" + - do: + search: + index: test + body: + retriever: + linear: + retrievers: [ + { + retriever: { + standard: { + query: { + bool: { + should: [ + { constant_score: { filter: { term: { keyword: { value: "one" } } }, boost: 3.0 } }, + { constant_score: { filter: { term: { keyword: { value: "two" } } }, boost: 4.0 } } + ] + } + } + } + }, + weight: 10.0, + normalizer: "l2_norm" + }, + { + retriever: { + standard: { + query: { + bool: { + should: [ + { constant_score: { filter: { term: { keyword: { value: "three" } } }, boost: 6.0 } }, + { constant_score: { filter: { term: { keyword: { value: "four" } } }, boost: 8.0 } } + ] + } + } + } + }, + weight: 2.0, + 
normalizer: "l2_norm" + } + ] + + - match: { hits.total.value: 4 } + - match: { hits.hits.0._id: "2" } + - match: { hits.hits.0._score: 8.0 } + - match: { hits.hits.1._id: "1" } + - match: { hits.hits.1._score: 6.0 } + - match: { hits.hits.2._id: "4" } + - close_to: { hits.hits.2._score: { value: 1.6, error: 0.001 } } + - match: { hits.hits.3._id: "3" } + - close_to: { hits.hits.3._score: { value: 1.2, error: 0.001 } } + +--- +"should handle all zero scores in normalization": + - requires: + cluster_features: [ "linear_retriever.l2_norm" ] + reason: "Requires l2_norm normalization support in linear retriever" + - do: + search: + index: test + body: + retriever: + linear: + retrievers: [ + { + retriever: { + standard: { + query: { + bool: { + should: [ + { constant_score: { filter: { term: { keyword: { value: "one" } } }, boost: 0.0 } }, + { constant_score: { filter: { term: { keyword: { value: "two" } } }, boost: 0.0 } }, + { constant_score: { filter: { term: { keyword: { value: "three" } } }, boost: 0.0 } }, + { constant_score: { filter: { term: { keyword: { value: "four" } } }, boost: 0.0 } } + ] + } + } + } + }, + weight: 1.0, + normalizer: "l2_norm" + } + ] + - match: { hits.total.value: 4 } + - close_to: { hits.hits.0._score: { value: 0.0, error: 0.0001 } } + - close_to: { hits.hits.1._score: { value: 0.0, error: 0.0001 } } + - close_to: { hits.hits.2._score: { value: 0.0, error: 0.0001 } } + - close_to: { hits.hits.3._score: { value: 0.0, error: 0.0001 } } + --- "should throw on unknown normalizer": - do: