From a83ecbc8e23e8c495b9d8942cf8939ccaf082504 Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Mon, 28 Apr 2025 17:28:15 -0700 Subject: [PATCH 1/8] wip --- .../index/mapper/TextFieldMapper.java | 6 ++++++ .../blockloader/TextFieldBlockLoaderTests.java | 15 +-------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index d77817915f387..c5405af1292b6 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -1087,6 +1087,12 @@ public Builder builder(BlockFactory factory, int expectedCount) { * using whatever */ private BlockSourceReader.LeafIteratorLookup blockReaderDisiLookup(BlockLoaderContext blContext) { + if (isSyntheticSource && syntheticSourceDelegate != null) { + // Since we are using synthetic source and a delegate, we can't use this field + // to determine if the delegate has values in the document (f.e. handling of `null` is different + // between text and keyword). + return BlockSourceReader.lookupMatchingAll(); + } if (isIndexed()) { if (getTextSearchInfo().hasNorms()) { return BlockSourceReader.lookupFromNorms(name()); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java index 5c9acaf18a45d..77c42740451ee 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java @@ -62,15 +62,8 @@ public static Object expectedValue(Map fieldMapping, Object valu if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) { var nullValue = (String) keywordMultiFieldMapping.get("null_value"); - // Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated. - // If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate, - // parse it and see null_value inside. - // But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist). - // Same goes for lookupFromFieldNames(). - boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true); - if (value == null) { - if (textFieldIndexed == false && nullValue != null && nullValue.length() <= (int) ignoreAbove) { + if (nullValue != null && nullValue.length() <= (int) ignoreAbove) { return new BytesRef(nullValue); } @@ -82,12 +75,6 @@ public static Object expectedValue(Map fieldMapping, Object valu } var values = (List) value; - - // See note above about TextFieldMapper#blockReaderDisiLookup. - if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) { - return null; - } - var indexed = values.stream() .map(s -> s == null ? nullValue : s) .filter(Objects::nonNull) From b8d335945fa6c6feb862749681bf47da4751f09c Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Tue, 29 Apr 2025 10:55:59 -0700 Subject: [PATCH 2/8] Update docs/changelog/127525.yaml --- docs/changelog/127525.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/127525.yaml diff --git a/docs/changelog/127525.yaml b/docs/changelog/127525.yaml new file mode 100644 index 0000000000000..ef1ee5c526b2d --- /dev/null +++ b/docs/changelog/127525.yaml @@ -0,0 +1,5 @@ +pr: 127525 +summary: Text field block loader properly handles null values from delegate +area: ES|QL +type: bug +issues: [] From 2be71427b8eb54630dae33c7eecca9ca04bb659f Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Tue, 29 Apr 2025 13:25:01 -0700 Subject: [PATCH 3/8] docs --- .../mapping-reference/keyword.md | 36 +++++++++++ .../elasticsearch/mapping-reference/text.md | 64 ++++--------------- .../index/mapper/TextFieldMapper.java | 1 + 3 files changed, 50 insertions(+), 51 deletions(-) diff --git a/docs/reference/elasticsearch/mapping-reference/keyword.md b/docs/reference/elasticsearch/mapping-reference/keyword.md index c392a71b59499..51441abcd5213 100644 --- a/docs/reference/elasticsearch/mapping-reference/keyword.md +++ b/docs/reference/elasticsearch/mapping-reference/keyword.md @@ -232,6 +232,42 @@ Will become: } ``` +If `null_value` is configured, `null` values are replaced with the `null_value` in synthetic source: + +$$$synthetic-source-keyword-example-null-value$$$ + +```console +PUT idx +{ + "settings": { + "index": { + "mapping": { + "source": { + "mode": "synthetic" + } + } + } + }, + "mappings": { + "properties": { + "kwd": { "type": "keyword", "null_value": "NA" } + } + } +} +PUT idx/_doc/1 +{ + "kwd": ["foo", null, "bar"] +} +``` + +Will become: + +```console-result +{ + "kwd": ["bar", "foo", "NA"] +} +``` + ## Constant keyword field type [constant-keyword-field-type] diff --git a/docs/reference/elasticsearch/mapping-reference/text.md b/docs/reference/elasticsearch/mapping-reference/text.md index 7295f2233d777..122c8177014a7 100644 --- a/docs/reference/elasticsearch/mapping-reference/text.md +++ b/docs/reference/elasticsearch/mapping-reference/text.md @@ -104,11 +104,20 @@ Synthetic `_source` is Generally Available only for TSDB indices (indices that h :::: -`text` fields support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) if they have a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field that supports synthetic `_source` or if the `text` field sets `store` to `true`. Either way, it may not have [`copy_to`](/reference/elasticsearch/mapping-reference/copy-to.md). +`text` fields may use a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field to support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) without storing values of the text field itself. + +::::{note} +Synthetic source of the `text` field will have the same [modifications](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) as a `keyword` field in this case. + +These modifications can impact usage of `text` fields: +* Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more detail. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default. +* Handling of `null` values is different. `text` fields ignore `null` values but `keyword` fields support replacing `null`s with a value specified in the `null_value` parameter. This replacement will be represented in synthetic source. +:::: -If using a sub-`keyword` field, then the values are sorted in the same way as a `keyword` field’s values are sorted. By default, that means sorted with duplicates removed. So: -$$$synthetic-source-text-example-default$$$ +If the `text` field sets `store` to `true` then the sub-field is not used and modifications mentioned above do not apply. + +$$$synthetic-source-text-example-stored$$$ ```console PUT idx @@ -126,6 +135,7 @@ PUT idx "properties": { "text": { "type": "text", + "store": true, "fields": { "raw": { "type": "keyword" @@ -147,54 +157,6 @@ PUT idx/_doc/1 Will become: -```console-result -{ - "text": [ - "jumped over the lazy dog", - "the quick brown fox" - ] -} -``` - -::::{note} -Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more detail. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default. -:::: - - -If the `text` field sets `store` to true then order and duplicates are preserved. - -$$$synthetic-source-text-example-stored$$$ - -```console -PUT idx -{ - "settings": { - "index": { - "mapping": { - "source": { - "mode": "synthetic" - } - } - } - }, - "mappings": { - "properties": { - "text": { "type": "text", "store": true } - } - } -} -PUT idx/_doc/1 -{ - "text": [ - "the quick brown fox", - "the quick brown fox", - "jumped over the lazy dog" - ] -} -``` - -Will become: - ```console-result { "text": [ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index c5405af1292b6..08cef586e1438 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -1093,6 +1093,7 @@ private BlockSourceReader.LeafIteratorLookup blockReaderDisiLookup(BlockLoaderCo // between text and keyword). return BlockSourceReader.lookupMatchingAll(); } + if (isIndexed()) { if (getTextSearchInfo().hasNorms()) { return BlockSourceReader.lookupFromNorms(name()); From d9dbc4dec8109909d819e0d0d200cbe99f03bed5 Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Tue, 29 Apr 2025 13:26:34 -0700 Subject: [PATCH 4/8] Delete docs/changelog/127525.yaml --- docs/changelog/127525.yaml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/changelog/127525.yaml diff --git a/docs/changelog/127525.yaml b/docs/changelog/127525.yaml deleted file mode 100644 index ef1ee5c526b2d..0000000000000 --- a/docs/changelog/127525.yaml +++ /dev/null @@ -1,5 +0,0 @@ -pr: 127525 -summary: Text field block loader properly handles null values from delegate -area: ES|QL -type: bug -issues: [] From ef97b0a7835bdb0144bd3dc268051b5997ad8d76 Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Mon, 5 May 2025 14:03:10 -0700 Subject: [PATCH 5/8] Update docs/reference/elasticsearch/mapping-reference/text.md Co-authored-by: shainaraskas <58563081+shainaraskas@users.noreply.github.com> --- docs/reference/elasticsearch/mapping-reference/text.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/elasticsearch/mapping-reference/text.md b/docs/reference/elasticsearch/mapping-reference/text.md index 122c8177014a7..4076d3fc469d0 100644 --- a/docs/reference/elasticsearch/mapping-reference/text.md +++ b/docs/reference/elasticsearch/mapping-reference/text.md @@ -104,7 +104,7 @@ Synthetic `_source` is Generally Available only for TSDB indices (indices that h :::: -`text` fields may use a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field to support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) without storing values of the text field itself. +`text` fields can use a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field to support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) without storing values of the text field itself. ::::{note} Synthetic source of the `text` field will have the same [modifications](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) as a `keyword` field in this case. From 07e2c158f02dfab8f5d6d3b4fb1fb1aaa0d96f8e Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Mon, 5 May 2025 14:03:46 -0700 Subject: [PATCH 6/8] Update docs/reference/elasticsearch/mapping-reference/text.md Co-authored-by: shainaraskas <58563081+shainaraskas@users.noreply.github.com> --- docs/reference/elasticsearch/mapping-reference/text.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/elasticsearch/mapping-reference/text.md b/docs/reference/elasticsearch/mapping-reference/text.md index 4076d3fc469d0..fa37b6fc71b49 100644 --- a/docs/reference/elasticsearch/mapping-reference/text.md +++ b/docs/reference/elasticsearch/mapping-reference/text.md @@ -115,7 +115,7 @@ These modifications can impact usage of `text` fields: :::: -If the `text` field sets `store` to `true` then the sub-field is not used and modifications mentioned above do not apply. +If the `text` field sets `store` to `true` then the sub-field is not used and no modifications are applied. $$$synthetic-source-text-example-stored$$$ From cd03338b26d01f6434d1d21239c37e20fcde1f91 Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Mon, 5 May 2025 14:04:45 -0700 Subject: [PATCH 7/8] Update docs/reference/elasticsearch/mapping-reference/text.md Co-authored-by: shainaraskas <58563081+shainaraskas@users.noreply.github.com> --- docs/reference/elasticsearch/mapping-reference/text.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/reference/elasticsearch/mapping-reference/text.md b/docs/reference/elasticsearch/mapping-reference/text.md index fa37b6fc71b49..4c3335efcb55b 100644 --- a/docs/reference/elasticsearch/mapping-reference/text.md +++ b/docs/reference/elasticsearch/mapping-reference/text.md @@ -106,13 +106,11 @@ Synthetic `_source` is Generally Available only for TSDB indices (indices that h `text` fields can use a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field to support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) without storing values of the text field itself. -::::{note} -Synthetic source of the `text` field will have the same [modifications](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) as a `keyword` field in this case. +In this case, the synthetic source of the `text` field will have the same [modifications](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) as a `keyword` field. These modifications can impact usage of `text` fields: -* Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more detail. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default. -* Handling of `null` values is different. `text` fields ignore `null` values but `keyword` fields support replacing `null`s with a value specified in the `null_value` parameter. This replacement will be represented in synthetic source. -:::: +* Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more details. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default. +* Handling of `null` values is different. `text` fields ignore `null` values, but `keyword` fields support replacing `null` values with a value specified in the `null_value` parameter. This replacement is represented in synthetic source. If the `text` field sets `store` to `true` then the sub-field is not used and no modifications are applied. From 2ade47d0e6d66c491a14739843c6d6946195be3a Mon Sep 17 00:00:00 2001 From: Oleksandr Kolomiiets Date: Tue, 6 May 2025 09:09:12 -0700 Subject: [PATCH 8/8] add an example --- .../elasticsearch/mapping-reference/text.md | 55 ++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/docs/reference/elasticsearch/mapping-reference/text.md b/docs/reference/elasticsearch/mapping-reference/text.md index 4c3335efcb55b..ff7d698c4bf61 100644 --- a/docs/reference/elasticsearch/mapping-reference/text.md +++ b/docs/reference/elasticsearch/mapping-reference/text.md @@ -112,8 +112,61 @@ These modifications can impact usage of `text` fields: * Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more details. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default. * Handling of `null` values is different. `text` fields ignore `null` values, but `keyword` fields support replacing `null` values with a value specified in the `null_value` parameter. This replacement is represented in synthetic source. +For example: -If the `text` field sets `store` to `true` then the sub-field is not used and no modifications are applied. +$$$synthetic-source-text-example-multi-field$$$ + +```console +PUT idx +{ + "settings": { + "index": { + "mapping": { + "source": { + "mode": "synthetic" + } + } + } + }, + "mappings": { + "properties": { + "text": { + "type": "text", + "fields": { + "kwd": { + "type": "keyword", + "null_value": "NA" + } + } + } + } + } +} +PUT idx/_doc/1 +{ + "text": [ + null, + "the quick brown fox", + "the quick brown fox", + "jumped over the lazy dog", + ] +} +``` + +Will become: + +```console-result +{ + "text": [ + "jumped over the lazy dog" + "NA", + "the quick brown fox" + ] +} +``` + + +If the `text` field sets `store` to `true` then the sub-field is not used and no modifications are applied. For example: $$$synthetic-source-text-example-stored$$$