Support knn filter on nested metadata

mayya-sharipova · mayya-sharipova · commit 76585580094a · 2025-07-29T14:04:52.000-04:00
This is to update our documentation that we now can support knn filter on nested metadata. Related to elastic/elasticsearch#113949
diff --git a/solutions/search/vector/knn.md b/solutions/search/vector/knn.md
@@ -575,6 +575,20 @@ PUT passage_vectors
                     "text": {
                         "type": "text",
                         "index": false
+                    },
+                    "language": {
+                        "type": "keyword"
+                    }
+                }
+            },
+            "metadata": {
+                "type": "nested",
+                "properties": {
+                    "key": {
+                        "type": "keyword"
+                    },
+                    "value": {
+                        "type": "text"
                     }
                 }
             }
@@ -588,9 +602,9 @@ With the above mapping, we can index multiple passage vectors along with storing
 ```console
 POST passage_vectors/_bulk?refresh=true
 { "index": { "_id": "1" } }
-{ "full_text": "first paragraph another paragraph", "creation_time": "2019-05-04", "paragraph": [ { "vector": [ 0.45, 45 ], "text": "first paragraph", "paragraph_id": "1" }, { "vector": [ 0.8, 0.6 ], "text": "another paragraph", "paragraph_id": "2" } ] }
+{ "full_text": "first paragraph another paragraph", "creation_time": "2019-05-04", "paragraph": [ { "vector": [ 0.45, 45 ], "text": "first paragraph", "paragraph_id": "1", "language": "EN" }, { "vector": [ 0.8, 0.6 ], "text": "another paragraph", "paragraph_id": "2", "language": "FR" } ], "metadata": [ { "key": "author", "value": "Jane Doe" }, { "key": "source", "value": "Internal Memo" } ] }
 { "index": { "_id": "2" } }
-{ "full_text": "number one paragraph number two paragraph", "creation_time": "2020-05-04", "paragraph": [ { "vector": [ 1.2, 4.5 ], "text": "number one paragraph", "paragraph_id": "1" }, { "vector": [ -1, 42 ], "text": "number two paragraph", "paragraph_id": "2" } ] }
+{ "full_text": "number one paragraph number two paragraph", "creation_time": "2020-05-04", "paragraph": [ { "vector": [ 1.2, 4.5 ], "text": "number one paragraph", "paragraph_id": "1", "language": "EN" }, { "vector": [ -1, 42 ], "text": "number two paragraph", "paragraph_id": "2", "language": "EN" }] , "metadata": [ { "key": "author", "value": "Jane Austen" }, { "key": "source", "value": "Financial" } ] }
 ```
 
 The query will seem very similar to a typical kNN search:
@@ -606,8 +620,7 @@ POST passage_vectors/_search
             45
         ],
         "field": "paragraph.vector",
-        "k": 2,
-        "num_candidates": 2
+        "k": 2
     }
 }
 ```
@@ -662,10 +675,14 @@ Note below that even though we have 4 total vectors, we still return two documen
 }
 ```
 
-What if you wanted to filter by some top-level document metadata? You can do this by adding `filter` to your `knn` clause.
+#### Filtering in nested kNN search [nested-knn-search-filtering]
+What if you wanted to filter by some top-level or nested metadata? You can do this by adding `filter` to your `knn` clause.
 
 ::::{note}
-`filter` will always be over the top-level document metadata. This means you cannot filter based on `nested` field metadata.
+To ensure correct results, each individual filter must target either
+top-level metadata or `nested` metadata, not a mix of both. However, a
+single kNN search supports multiple filters, so some filters can be over
+top-level metadata while others are over nested metadata.
 ::::
 
 
@@ -678,26 +695,16 @@ POST passage_vectors/_search
     ],
     "_source": false,
     "knn": {
-        "query_vector": [
-            0.45,
-            45
-        ],
+        "query_vector": [0.45, 45],
         "field": "paragraph.vector",
         "k": 2,
-        "num_candidates": 2,
         "filter": {
-            "bool": {
-                "filter": [
-                    {
-                        "range": {
-                            "creation_time": {
-                                "gte": "2019-05-01",
-                                "lte": "2019-05-05"
-                            }
-                        }
-                    }
-                ]
-            }
+          "range": {
+              "creation_time": {
+                  "gte": "2019-05-01",
+                  "lte": "2019-05-05"
+              }
+          }
         }
     }
 }
@@ -740,6 +747,87 @@ Now we have filtered based on the top level `"creation_time"` and only one docum
 }
 ```
 
+Below we filter on nested metadata.
+When scoring parent documents, this query only considers vectors that
+have `paragraph.language` set to `EN`.
+
+```console
+POST passage_vectors/_search
+{
+    "fields": [
+        "full_text"
+    ],
+    "_source": false,
+    "knn": {
+        "query_vector": [0.45, 45],
+        "field": "paragraph.vector",
+        "k": 2,
+        "filter": {
+          "match": {
+            "paragraph.language": "EN"
+          }
+        }
+    }
+}
+```
+
+Below we have two filters: one over nested metadata
+and another over the top-level metadata. When scoring parent documents,
+this query only considers vectors that have `paragraph.language` set to `EN`
+and whose parent documents have a creation time within the requested range.
+
+```console
+POST passage_vectors/_search
+{
+    "fields": [
+        "full_text"
+    ],
+    "_source": false,
+    "knn": {
+        "query_vector": [0.45,45],
+        "field": "paragraph.vector",
+        "k": 2,
+        "filter": [
+            {"match": {"paragraph.language": "EN"}},
+            {"range": { "creation_time": { "gte": "2019-05-01", "lte": "2019-05-05"}}}
+        ]
+    }
+}
+```
+
+#### Filtering by sibling nested fields in nested kNN search [nested-knn-search-filtering-sibling]
+Nested kNN search also supports pre-filtering on sibling nested fields.
+For example, given `paragraph` and `metadata` as nested fields, we can search `paragraph.vector` and filter by `metadata.key` and `metadata.value`.
+
+```console
+POST passage_vectors/_search
+{
+    "fields": [
+        "full_text"
+    ],
+    "_source": false,
+    "knn": {
+        "query_vector": [0.45, 45],
+        "field": "paragraph.vector",
+        "k": 2,
+        "filter": {
+            "nested": {
+                "path": "metadata",
+                "query": {
+                    "bool": {
+                        "must": [
+                            { "match": { "metadata.key": "author" } },
+                            { "match": { "metadata.value": "Doe" } }
+                        ]
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+Retrieving `inner_hits` when filtering on sibling nested fields is not supported.
 
 ### Nested kNN Search with Inner hits [nested-knn-search-inner-hits]