Skip to content

Commit 7658558

Browse files
Support knn filter on nested metadata
This is to update our documentation that we now can support knn filter on nested metadata. Related to elastic/elasticsearch#113949
1 parent 16ef38e commit 7658558

File tree

1 file changed

+111
-23
lines changed

1 file changed

+111
-23
lines changed

solutions/search/vector/knn.md

Lines changed: 111 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,20 @@ PUT passage_vectors
575575
"text": {
576576
"type": "text",
577577
"index": false
578+
},
579+
"language": {
580+
"type": "keyword"
581+
}
582+
}
583+
},
584+
"metadata": {
585+
"type": "nested",
586+
"properties": {
587+
"key": {
588+
"type": "keyword"
589+
},
590+
"value": {
591+
"type": "text"
578592
}
579593
}
580594
}
@@ -588,9 +602,9 @@ With the above mapping, we can index multiple passage vectors along with storing
588602
```console
589603
POST passage_vectors/_bulk?refresh=true
590604
{ "index": { "_id": "1" } }
591-
{ "full_text": "first paragraph another paragraph", "creation_time": "2019-05-04", "paragraph": [ { "vector": [ 0.45, 45 ], "text": "first paragraph", "paragraph_id": "1" }, { "vector": [ 0.8, 0.6 ], "text": "another paragraph", "paragraph_id": "2" } ] }
605+
{ "full_text": "first paragraph another paragraph", "creation_time": "2019-05-04", "paragraph": [ { "vector": [ 0.45, 45 ], "text": "first paragraph", "paragraph_id": "1", "language": "EN" }, { "vector": [ 0.8, 0.6 ], "text": "another paragraph", "paragraph_id": "2", "language": "FR" } ], "metadata": [ { "key": "author", "value": "Jane Doe" }, { "key": "source", "value": "Internal Memo" } ] }
592606
{ "index": { "_id": "2" } }
593-
{ "full_text": "number one paragraph number two paragraph", "creation_time": "2020-05-04", "paragraph": [ { "vector": [ 1.2, 4.5 ], "text": "number one paragraph", "paragraph_id": "1" }, { "vector": [ -1, 42 ], "text": "number two paragraph", "paragraph_id": "2" } ] }
607+
{ "full_text": "number one paragraph number two paragraph", "creation_time": "2020-05-04", "paragraph": [ { "vector": [ 1.2, 4.5 ], "text": "number one paragraph", "paragraph_id": "1", "language": "EN" }, { "vector": [ -1, 42 ], "text": "number two paragraph", "paragraph_id": "2", "language": "EN" }] , "metadata": [ { "key": "author", "value": "Jane Austen" }, { "key": "source", "value": "Financial" } ] }
594608
```
595609

596610
The query will seem very similar to a typical kNN search:
@@ -606,8 +620,7 @@ POST passage_vectors/_search
606620
45
607621
],
608622
"field": "paragraph.vector",
609-
"k": 2,
610-
"num_candidates": 2
623+
"k": 2
611624
}
612625
}
613626
```
@@ -662,10 +675,14 @@ Note below that even though we have 4 total vectors, we still return two documen
662675
}
663676
```
664677

665-
What if you wanted to filter by some top-level document metadata? You can do this by adding `filter` to your `knn` clause.
678+
#### Filtering in nested kNN search [nested-knn-search-filtering]
679+
What if you wanted to filter by some top-level or nested metadata? You can do this by adding `filter` to your `knn` clause.
666680

667681
::::{note}
668-
`filter` will always be over the top-level document metadata. This means you cannot filter based on `nested` field metadata.
682+
To ensure correct results, each individual filter must be over either
683+
the top-level metadata or `nested` metadata. However, a single knn search
684+
supports multiple filters, where some filters can be over the top-level
685+
metadata and some over nested.
669686
::::
670687

671688

@@ -678,26 +695,16 @@ POST passage_vectors/_search
678695
],
679696
"_source": false,
680697
"knn": {
681-
"query_vector": [
682-
0.45,
683-
45
684-
],
698+
"query_vector": [0.45, 45],
685699
"field": "paragraph.vector",
686700
"k": 2,
687-
"num_candidates": 2,
688701
"filter": {
689-
"bool": {
690-
"filter": [
691-
{
692-
"range": {
693-
"creation_time": {
694-
"gte": "2019-05-01",
695-
"lte": "2019-05-05"
696-
}
697-
}
698-
}
699-
]
700-
}
702+
"range": {
703+
"creation_time": {
704+
"gte": "2019-05-01",
705+
"lte": "2019-05-05"
706+
}
707+
}
701708
}
702709
}
703710
}
@@ -740,6 +747,87 @@ Now we have filtered based on the top level `"creation_time"` and only one docum
740747
}
741748
```
742749

750+
Below we filter on nested metadata.
751+
When scoring parent documents, this query only considers vectors that
752+
have "paragraph.language" set to "EN".
753+
754+
```console
755+
POST passage_vectors/_search
756+
{
757+
"fields": [
758+
"full_text"
759+
],
760+
"_source": false,
761+
"knn": {
762+
"query_vector": [0.45, 45],
763+
"field": "paragraph.vector",
764+
"k": 2,
765+
"filter": {
766+
"match": {
767+
"paragraph.language": "EN"
768+
}
769+
}
770+
}
771+
}
772+
```
773+
774+
Below we have two filters: one over nested metadata
775+
and another over the top-level metadata. When scoring parent documents,
776+
this query only considers vectors that have "paragraph.language" set to "EN"
777+
and whose parents have a creation time within the requested range.
778+
779+
```console
780+
POST passage_vectors/_search
781+
{
782+
"fields": [
783+
"full_text"
784+
],
785+
"_source": false,
786+
"knn": {
787+
"query_vector": [0.45,45],
788+
"field": "paragraph.vector",
789+
"k": 2,
790+
"filter": [
791+
{"match": {"paragraph.language": "EN"}},
792+
{"range": { "creation_time": { "gte": "2019-05-01", "lte": "2019-05-05"}}}
793+
]
794+
}
795+
}
796+
```
797+
798+
#### Filtering by sibling nested fields in nested kNN search [nested-knn-search-filtering-sibling]
799+
Nested kNN search also supports pre-filtering on sibling nested fields.
800+
For example, given "paragraph" and "metadata" as nested fields, we can search "paragraph.vector" and filter by "metadata.key" and "metadata.value".
801+
802+
```console
803+
POST passage_vectors/_search
804+
{
805+
"fields": [
806+
"full_text"
807+
],
808+
"_source": false,
809+
"knn": {
810+
"query_vector": [0.45, 45],
811+
"field": "paragraph.vector",
812+
"k": 2,
813+
"filter": {
814+
"nested": {
815+
"path": "metadata",
816+
"query": {
817+
"bool": {
818+
"must": [
819+
{ "match": { "metadata.key": "author" } },
820+
{ "match": { "metadata.value": "Doe" } }
821+
]
822+
}
823+
}
824+
}
825+
}
826+
}
827+
}
828+
```
829+
830+
Retrieving "inner_hits" when filtering on sibling nested fields is not supported.
743831

744832
### Nested kNN Search with Inner hits [nested-knn-search-inner-hits]
745833

0 commit comments

Comments
 (0)