From 76585580094a8d10726fe1217cc446aad17d599b Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Tue, 29 Jul 2025 14:04:52 -0400 Subject: [PATCH 1/6] Support knn filter on nested metadata Update the documentation to reflect that knn search now supports filtering on nested metadata. Related to https://github.com/elastic/elasticsearch/pull/113949 --- solutions/search/vector/knn.md | 134 +++++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 23 deletions(-) diff --git a/solutions/search/vector/knn.md b/solutions/search/vector/knn.md index 9357258517..e92f270334 100644 --- a/solutions/search/vector/knn.md +++ b/solutions/search/vector/knn.md @@ -575,6 +575,20 @@ PUT passage_vectors "text": { "type": "text", "index": false + }, + "language": { + "type": "keyword" + } + } + }, + "metadata": { + "type": "nested", + "properties": { + "key": { + "type": "keyword" + }, + "value": { + "type": "text" } } } @@ -588,9 +602,9 @@ With the above mapping, we can index multiple passage vectors along with storing ```console POST passage_vectors/_bulk?refresh=true { "index": { "_id": "1" } } -{ "full_text": "first paragraph another paragraph", "creation_time": "2019-05-04", "paragraph": [ { "vector": [ 0.45, 45 ], "text": "first paragraph", "paragraph_id": "1" }, { "vector": [ 0.8, 0.6 ], "text": "another paragraph", "paragraph_id": "2" } ] } +{ "full_text": "first paragraph another paragraph", "creation_time": "2019-05-04", "paragraph": [ { "vector": [ 0.45, 45 ], "text": "first paragraph", "paragraph_id": "1", "language": "EN" }, { "vector": [ 0.8, 0.6 ], "text": "another paragraph", "paragraph_id": "2", "language": "FR" } ], "metadata": [ { "key": "author", "value": "Jane Doe" }, { "key": "source", "value": "Internal Memo" } ] } { "index": { "_id": "2" } } -{ "full_text": "number one paragraph number two paragraph", "creation_time": "2020-05-04", "paragraph": [ { "vector": [ 1.2, 4.5 ], "text": "number one paragraph", "paragraph_id": "1" }, { "vector": [ -1, 42 ], 
"text": "number two paragraph", "paragraph_id": "2" } ] } +{ "full_text": "number one paragraph number two paragraph", "creation_time": "2020-05-04", "paragraph": [ { "vector": [ 1.2, 4.5 ], "text": "number one paragraph", "paragraph_id": "1", "language": "EN" }, { "vector": [ -1, 42 ], "text": "number two paragraph", "paragraph_id": "2", "language": "EN" }] , "metadata": [ { "key": "author", "value": "Jane Austen" }, { "key": "source", "value": "Financial" } ] } ``` The query will seem very similar to a typical kNN search: @@ -606,8 +620,7 @@ POST passage_vectors/_search 45 ], "field": "paragraph.vector", - "k": 2, - "num_candidates": 2 + "k": 2 } } ``` @@ -662,10 +675,14 @@ Note below that even though we have 4 total vectors, we still return two documen } ``` -What if you wanted to filter by some top-level document metadata? You can do this by adding `filter` to your `knn` clause. +#### Filtering in nested KNN search [nested-knn-search-filtering] +What if you wanted to filter by some top-level or nested metadata? You can do this by adding `filter` to your `knn` clause. ::::{note} -`filter` will always be over the top-level document metadata. This means you cannot filter based on `nested` field metadata. +To ensure correct results: each individual filter must be either over +the top-level metadata or `nested` metadata. However, a single knn search +supports multiple filters, where some filters can be over the top-level +metadata and some over nested. 
:::: @@ -678,26 +695,16 @@ POST passage_vectors/_search ], "_source": false, "knn": { - "query_vector": [ - 0.45, - 45 - ], + "query_vector": [0.45, 45], "field": "paragraph.vector", "k": 2, - "num_candidates": 2, "filter": { - "bool": { - "filter": [ - { - "range": { - "creation_time": { - "gte": "2019-05-01", - "lte": "2019-05-05" - } - } - } - ] - } + "range": { + "creation_time": { + "gte": "2019-05-01", + "lte": "2019-05-05" + } + } } } } @@ -740,6 +747,87 @@ Now we have filtered based on the top level `"creation_time"` and only one docum } ``` +Below we filter on nested metadata. +When scoring parents' documents this query only considers vectors that +have "paragraph.language" set to "EN". + +```console +POST passage_vectors/_search +{ + "fields": [ + "full_text" + ], + "_source": false, + "knn": { + "query_vector": [0.45, 45], + "field": "paragraph.vector", + "k": 2, + "filter": { + "match": { + "paragraph.language": "EN" + } + } + } +} +``` + +Below we have two filters: one over nested metadata +and another over the top level metadata. For scoring parents' documents, +this query only considers vectors that have "paragraph.language" set to "EN" +and whose parent's have creation time within the request range. + +```console +POST passage_vectors/_search +{ + "fields": [ + "full_text" + ], + "_source": false, + "knn": { + "query_vector": [0.45,45], + "field": "paragraph.vector", + "k": 2, + "filter": [ + {"match": {"paragraph.language": "EN"}}, + {"range": { "creation_time": { "gte": "2019-05-01", "lte": "2019-05-05"}}} + ] + } +} +``` + +#### Filtering by sibling nested fields in nested KNN search [nested-knn-search-filtering-sibling] +Nested knn search also allows to do pre-filtering on sibling nested fields. +For example, given "paragraphs" and "metadata" as nested fields, we can search "paragraphs.vector" and filter by "metadata.key" and "metadata.value". 
+ +```console +POST passage_vectors/_search +{ + "fields": [ + "full_text" + ], + "_source": false, + "knn": { + "query_vector": [0.45, 45], + "field": "paragraph.vector", + "k": 2, + "filter": { + "nested": { + "path": "metadata", + "query": { + "bool": { + "must": [ + { "match": { "metadata.key": "author" } }, + { "match": { "metadata.value": "Doe" } } + ] + } + } + } + } + } +} +``` + +Retrieving "inner_hits" when filtering on sibling nested fiels is not supported. ### Nested kNN Search with Inner hits [nested-knn-search-inner-hits] From 63a2c7b2e64c39e7524ebe62050c32a0815a1524 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 30 Jul 2025 09:52:27 -0400 Subject: [PATCH 2/6] Add applies_to from 9.2 --- solutions/search/vector/knn.md | 44 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/solutions/search/vector/knn.md b/solutions/search/vector/knn.md index e92f270334..98a3b5bc75 100644 --- a/solutions/search/vector/knn.md +++ b/solutions/search/vector/knn.md @@ -676,14 +676,15 @@ Note below that even though we have 4 total vectors, we still return two documen ``` #### Filtering in nested KNN search [nested-knn-search-filtering] -What if you wanted to filter by some top-level or nested metadata? You can do this by adding `filter` to your `knn` clause. +Want to filter by metadata? You can do this by adding `filter` to your `knn` clause. -::::{note} -To ensure correct results: each individual filter must be either over -the top-level metadata or `nested` metadata. However, a single knn search -supports multiple filters, where some filters can be over the top-level -metadata and some over nested. -:::: +To ensure correct results, each individual filter must be either over: + +- Top-level metadata +- `nested` metadata {applies_to}`stack: ga 9.2` + :::{note} + A single `knn` search supports multiple filters, where some filters can be over the top-level metadata and some over nested. 
+ ::: ```console @@ -699,12 +700,12 @@ POST passage_vectors/_search "field": "paragraph.vector", "k": 2, "filter": { - "range": { - "creation_time": { - "gte": "2019-05-01", - "lte": "2019-05-05" - } - } + "range": { + "creation_time": { + "gte": "2019-05-01", + "lte": "2019-05-05" + } + } } } } @@ -747,6 +748,11 @@ Now we have filtered based on the top level `"creation_time"` and only one docum } ``` +##### Filtering on nested metadata [nested-knn-search-filtering-nested-metadata] +```{applies_to} +stack: ga 9.2 +``` + Below we filter on nested metadata. When scoring parents' documents this query only considers vectors that have "paragraph.language" set to "EN". @@ -763,9 +769,9 @@ POST passage_vectors/_search "field": "paragraph.vector", "k": 2, "filter": { - "match": { - "paragraph.language": "EN" - } + "match": { + "paragraph.language": "EN" + } } } } @@ -774,7 +780,7 @@ POST passage_vectors/_search Below we have two filters: one over nested metadata and another over the top level metadata. For scoring parents' documents, this query only considers vectors that have "paragraph.language" set to "EN" -and whose parent's have creation time within the request range. +and whose parents have creation time within the request range. ```console POST passage_vectors/_search @@ -796,6 +802,10 @@ POST passage_vectors/_search ``` #### Filtering by sibling nested fields in nested KNN search [nested-knn-search-filtering-sibling] +```{applies_to} +stack: ga 9.2 +``` + Nested knn search also allows to do pre-filtering on sibling nested fields. For example, given "paragraphs" and "metadata" as nested fields, we can search "paragraphs.vector" and filter by "metadata.key" and "metadata.value". 
From 86efff0a98a524d1f988143d84b9a5c87dac0288 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 30 Jul 2025 11:36:58 -0400 Subject: [PATCH 3/6] Update solutions/search/vector/knn.md Co-authored-by: Liam Thompson --- solutions/search/vector/knn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solutions/search/vector/knn.md b/solutions/search/vector/knn.md index 98a3b5bc75..c6f0dc7c0d 100644 --- a/solutions/search/vector/knn.md +++ b/solutions/search/vector/knn.md @@ -753,8 +753,8 @@ Now we have filtered based on the top level `"creation_time"` and only one docum stack: ga 9.2 ``` -Below we filter on nested metadata. -When scoring parents' documents this query only considers vectors that +The following query filters on nested metadata. +When scoring parent documents, this query only considers vectors that have "paragraph.language" set to "EN". ```console From 42c85d6f676656c2abe85e45d83b7face68c878a Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 30 Jul 2025 11:37:28 -0400 Subject: [PATCH 4/6] Update solutions/search/vector/knn.md Co-authored-by: Liam Thompson --- solutions/search/vector/knn.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/solutions/search/vector/knn.md b/solutions/search/vector/knn.md index c6f0dc7c0d..a8ab6aac02 100644 --- a/solutions/search/vector/knn.md +++ b/solutions/search/vector/knn.md @@ -777,10 +777,10 @@ POST passage_vectors/_search } ``` -Below we have two filters: one over nested metadata -and another over the top level metadata. For scoring parents' documents, -this query only considers vectors that have "paragraph.language" set to "EN" -and whose parents have creation time within the request range. +The following query has two filters: one over nested metadata and +another over the top-level metadata. 
When scoring parent documents, this +query only considers vectors that have "paragraph.language" set to "EN" +and whose parent documents were created within the specified range. ```console POST passage_vectors/_search From 57a02b7ff862bef21742485cec6f9909221fe689 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 30 Jul 2025 11:37:43 -0400 Subject: [PATCH 5/6] Update solutions/search/vector/knn.md Co-authored-by: Liam Thompson --- solutions/search/vector/knn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solutions/search/vector/knn.md b/solutions/search/vector/knn.md index a8ab6aac02..0e7611dc62 100644 --- a/solutions/search/vector/knn.md +++ b/solutions/search/vector/knn.md @@ -806,7 +806,7 @@ POST passage_vectors/_search stack: ga 9.2 ``` -Nested knn search also allows to do pre-filtering on sibling nested fields. +Nested knn search also allows pre-filtering on sibling nested fields. For example, given "paragraphs" and "metadata" as nested fields, we can search "paragraphs.vector" and filter by "metadata.key" and "metadata.value". ```console From f346f4a25ce48f3856079beb920093641c66374f Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 30 Jul 2025 11:38:00 -0400 Subject: [PATCH 6/6] Update solutions/search/vector/knn.md Co-authored-by: Liam Thompson --- solutions/search/vector/knn.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/solutions/search/vector/knn.md b/solutions/search/vector/knn.md index 0e7611dc62..ca3c5cc498 100644 --- a/solutions/search/vector/knn.md +++ b/solutions/search/vector/knn.md @@ -837,7 +837,9 @@ POST passage_vectors/_search } ``` -Retrieving "inner_hits" when filtering on sibling nested fiels is not supported. +:::{note} +Retrieving "inner_hits" when filtering on sibling nested fields is not supported. +::: ### Nested kNN Search with Inner hits [nested-knn-search-inner-hits]