Skip to content

Commit b6d5109

Browse files
committed
WIP
1 parent cdef674 commit b6d5109

File tree

2 files changed

+276
-26
lines changed

2 files changed

+276
-26
lines changed

x-pack/plugin/rank-rrf/src/main/java/org/elasticsearch/xpack/rank/linear/LinearRetrieverBuilder.java

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,34 @@ private static ScoreNormalizer[] getDefaultNormalizers(List<RetrieverSource> inn
123123
return normalizers;
124124
}
125125

126+
private void normalizeNormalizerArray(ScoreNormalizer topLevelNormalizer, ScoreNormalizer[] normalizers) {
127+
for (int i = 0; i < normalizers.length; i++) {
128+
ScoreNormalizer current = normalizers[i];
129+
130+
if (topLevelNormalizer != null) {
131+
// Validate explicit per-retriever normalizers match top-level
132+
if (current != null && !current.equals(DEFAULT_NORMALIZER) && !current.equals(topLevelNormalizer)) {
133+
throw new IllegalArgumentException(
134+
String.format(
135+
"[%s] All per-retriever normalizers must match the top-level normalizer: "
136+
+ "expected [%s], found [%s] in retriever [%d]",
137+
NAME, topLevelNormalizer.getName(), current.getName(), i
138+
)
139+
);
140+
}
141+
// Propagate top-level normalizer to unspecified positions
142+
if (current == null || current.equals(DEFAULT_NORMALIZER)) {
143+
normalizers[i] = topLevelNormalizer;
144+
}
145+
} else {
146+
// No top-level normalizer: ensure null values become DEFAULT_NORMALIZER
147+
if (current == null) {
148+
normalizers[i] = DEFAULT_NORMALIZER;
149+
}
150+
}
151+
}
152+
}
153+
126154
public static LinearRetrieverBuilder fromXContent(XContentParser parser, RetrieverParserContext context) throws IOException {
127155
if (context.clusterSupportsFeature(LINEAR_RETRIEVER_SUPPORTED) == false) {
128156
throw new ParsingException(parser.getTokenLocation(), "unknown retriever [" + NAME + "]");
@@ -185,32 +213,7 @@ public LinearRetrieverBuilder(
185213
this.query = query;
186214
this.normalizer = normalizer;
187215

188-
if (normalizer != null) {
189-
// First pass: validate that any specified per-retriever normalizers match the top-level one
190-
for (int i = 0; i < normalizers.length; i++) {
191-
ScoreNormalizer subNormalizer = normalizers[i];
192-
if (subNormalizer != null && !subNormalizer.equals(DEFAULT_NORMALIZER) && !subNormalizer.equals(normalizer)) {
193-
throw new IllegalArgumentException(
194-
"["
195-
+ NAME
196-
+ "] All per-retriever normalizers must match the top-level normalizer: "
197-
+ "expected ["
198-
+ normalizer.getName()
199-
+ "], found ["
200-
+ subNormalizer.getName()
201-
+ "] in retriever ["
202-
+ i
203-
+ "]"
204-
);
205-
}
206-
}
207-
// Second pass: propagate top-level normalizer to any unspecified positions
208-
for (int i = 0; i < normalizers.length; i++) {
209-
if (normalizers[i] == null || normalizers[i].equals(DEFAULT_NORMALIZER)) {
210-
normalizers[i] = normalizer;
211-
}
212-
}
213-
}
216+
normalizeNormalizerArray(normalizer, normalizers);
214217

215218
}
216219

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
setup:
  - requires:
      cluster_features: [ "linear_retriever_supported", "linear_retriever.l2_norm" ]
      reason: "Support for linear retriever and L2 normalization"
      test_runner_features: close_to

  - do:
      indices.create:
        index: test
        body:
          mappings:
            properties:
              vector:
                type: dense_vector
                dims: 1
                index: true
                similarity: l2_norm
                index_options:
                  type: flat
              keyword:
                type: keyword
              other_keyword:
                type: keyword
              timestamp:
                type: date

  - do:
      bulk:
        refresh: true
        index: test
        body:
          - '{"index": {"_id": 1 }}'
          - '{"vector": [1], "keyword": "one", "other_keyword": "other", "timestamp": "2021-01-01T00:00:00"}'
          - '{"index": {"_id": 2 }}'
          - '{"vector": [2], "keyword": "two", "timestamp": "2022-01-01T00:00:00"}'
          - '{"index": {"_id": 3 }}'
          - '{"vector": [3], "keyword": "three", "timestamp": "2023-01-01T00:00:00"}'
          - '{"index": {"_id": 4 }}'
          - '{"vector": [4], "keyword": "four", "other_keyword": "other", "timestamp": "2024-01-01T00:00:00"}'

---
"Linear retriever with top-level L2 normalization":
  - do:
      search:
        index: test
        body:
          retriever:
            linear:
              normalizer: l2_norm
              retrievers: [
                {
                  retriever: {
                    standard: {
                      query: {
                        constant_score: {
                          filter: {
                            term: {
                              keyword: {
                                value: "one"
                              }
                            }
                          },
                          boost: 5.0
                        }
                      }
                    }
                  },
                  weight: 1.0
                },
                {
                  retriever: {
                    standard: {
                      query: {
                        constant_score: {
                          filter: {
                            term: {
                              keyword: {
                                value: "four"
                              }
                            }
                          },
                          boost: 12.0
                        }
                      }
                    }
                  },
                  weight: 1.0
                }
              ]

  - match: { hits.total.value: 2 }
  - match: { hits.hits.0._id: "4" } # Doc 4 should rank higher with normalized scores
  - match: { hits.hits.1._id: "1" }
  # With L2 normalization: [5.0, 12.0] becomes [5.0/13.0, 12.0/13.0]
  - close_to: { hits.hits.0._score: { value: 0.923, error: 0.01 } } # 12.0/13.0
  - close_to: { hits.hits.1._score: { value: 0.385, error: 0.01 } } # 5.0/13.0

---
"Linear retriever with per-retriever L2 normalization":
  - do:
      search:
        index: test
        body:
          retriever:
            linear:
              retrievers: [
                {
                  retriever: {
                    standard: {
                      query: {
                        constant_score: {
                          filter: {
                            term: {
                              keyword: {
                                value: "one"
                              }
                            }
                          },
                          boost: 5.0
                        }
                      }
                    }
                  },
                  weight: 1.0,
                  normalizer: l2_norm
                },
                {
                  retriever: {
                    standard: {
                      query: {
                        constant_score: {
                          filter: {
                            term: {
                              keyword: {
                                value: "four"
                              }
                            }
                          },
                          boost: 12.0
                        }
                      }
                    }
                  },
                  weight: 1.0,
                  normalizer: l2_norm
                }
              ]

  - match: { hits.total.value: 2 }
  # With per-retriever L2 normalization, both scores would be normalized to 1.0
  # So final score = 1.0 * weight1 + 1.0 * weight2 = 2.0 for each doc
  # Then sorting is done by _doc (or some other tiebreaker)
  - close_to: { hits.hits.0._score: { value: 1.0, error: 0.01 } }
  - close_to: { hits.hits.1._score: { value: 1.0, error: 0.01 } }

---
"Linear retriever with mixed normalization (top-level and per-retriever with same normalizer)":
  - do:
      search:
        index: test
        body:
          retriever:
            linear:
              normalizer: l2_norm
              retrievers: [
                {
                  retriever: {
                    standard: {
                      query: {
                        constant_score: {
                          filter: {
                            term: {
                              keyword: {
                                value: "one"
                              }
                            }
                          },
                          boost: 5.0
                        }
                      }
                    }
                  },
                  weight: 1.0
                },
                {
                  retriever: {
                    standard: {
                      query: {
                        constant_score: {
                          filter: {
                            term: {
                              keyword: {
                                value: "four"
                              }
                            }
                          },
                          boost: 12.0
                        }
                      }
                    }
                  },
                  weight: 1.0,
                  normalizer: l2_norm
                }
              ]

  - match: { hits.total.value: 2 }
  - match: { hits.hits.0._id: "4" }
  - match: { hits.hits.1._id: "1" }
  # With L2 normalization: [5.0, 12.0] becomes [5.0/13.0, 12.0/13.0]
  - close_to: { hits.hits.0._score: { value: 0.923, error: 0.01 } }
  - close_to: { hits.hits.1._score: { value: 0.385, error: 0.01 } }

---
"Linear retriever with mismatched normalizers (should fail)":
  - do:
      catch: bad_request
      search:
        index: test
        body:
          retriever:
            linear:
              normalizer: l2_norm
              retrievers: [
                {
                  retriever: {
                    standard: {
                      query: {
                        match_all: {}
                      }
                    }
                  }
                },
                {
                  retriever: {
                    standard: {
                      query: {
                        match_all: {}
                      }
                    }
                  },
                  normalizer: minmax
                }
              ]

  - match: { error.root_cause.0.type: "illegal_argument_exception" }
  - match: { error.root_cause.0.reason: /.*All per-retriever normalizers must match the top-level normalizer.*/ }

0 commit comments

Comments
 (0)