Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/144112.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
area: ES|QL
issues:
- 143916
pr: 144112
summary: Loading unmapped fields on synthetic `_source`
type: bug
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ private static boolean indexSortConfigByHostName(final IndexSortConfig indexSort

public static final TypeParser PARSER = createTypeParserWithLegacySupport(Builder::new);

public static final class KeywordFieldType extends TextFamilyFieldType {
public static class KeywordFieldType extends TextFamilyFieldType {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't change this class unless someone from the core ES team approves it. My advice is to seek approval for this from the relevant team.


private static final IgnoreAbove IGNORE_ABOVE_DEFAULT = new IgnoreAbove(null, IndexMode.STANDARD);

Expand Down Expand Up @@ -933,8 +933,7 @@ private FallbackSyntheticSourceBlockLoader.Reader<?> fallbackSyntheticSourceBloc
return new FallbackSyntheticSourceBlockLoader.SingleValueReader<BytesRef>(nullValueBytes) {
@Override
public void convertValue(Object value, List<BytesRef> accumulator) {
String stringValue = ((BytesRef) value).utf8ToString();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're fine to change this line rather than making KeywordFieldType non-final, i.e.:

String stringValue = value instanceof BytesRef br ? br.utf8ToString() : value.toString();

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to be extra defensive and avoid changing behavior if the field is not potentially unmapped, but if you're fine with it then I'm fine with it :)

String adjusted = applyIgnoreAboveAndNormalizer(stringValue);
String adjusted = applyIgnoreAboveAndNormalizer(sourceValueToString(value));
if (adjusted != null) {
// TODO what if the value didn't change?
accumulator.add(new BytesRef(adjusted));
Expand Down Expand Up @@ -962,6 +961,10 @@ public void writeToBlock(List<BytesRef> values, BlockLoader.Builder blockBuilder
};
}

/**
 * Turns a value read from source into the string that is subsequently passed through
 * the ignore-above / normalizer handling.
 *
 * <p>The value is cast to {@link BytesRef} and decoded via {@code utf8ToString()}; any other
 * runtime type results in a {@link ClassCastException}.
 *
 * <p>NOTE(review): declared {@code protected}, presumably so that subclasses can accept other
 * value types - confirm against the overriding classes.
 *
 * @param value the raw value; must be a {@link BytesRef}
 * @return the decoded string
 */
protected String sourceValueToString(Object value) {
    final BytesRef bytes = (BytesRef) value;
    return bytes.utf8ToString();
}

private BlockSourceReader.LeafIteratorLookup sourceBlockLoaderLookup(BlockLoaderContext blContext) {
if (getTextSearchInfo().hasNorms()) {
return BlockSourceReader.lookupFromNorms(name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ public class CsvTestsDataLoader {
new TestDataset("ul_logs"),
new TestDataset("sample_data"),
new TestDataset("partial_mapping_sample_data"),
new TestDataset(
"partial_mapping_synthetic_sample_data",
"mapping-partial_mapping_sample_data.json",
"partial_mapping_sample_data.csv"
).withSetting("partial-mapping-synthetic-settings.json"),
new TestDataset("no_mapping_sample_data", "mapping-no_mapping_sample_data.json", "partial_mapping_sample_data.csv").withTypeMapping(
Stream.of("timestamp", "client_ip", "event_duration").collect(toMap(k -> k, k -> "keyword"))
),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@timestamp:date,client_ip:ip,event_duration:long,message:keyword,unmapped_message:keyword,unmapped_event_duration:keyword,unmapped.nested:keyword
@timestamp:date,client_ip:ip,event_duration:long,message:keyword,unmapped_message:keyword,unmapped_event_duration:long,unmapped.nested:keyword
2024-10-23T13:55:01.543Z,173.21.3.15,1756466,Connected to 10.1.0.1!,Disconnected from 10.1.0.1,1756468,a
2024-10-23T13:53:55.832Z,173.21.3.15,5033754,Connection error?,Disconnection error,5033756,b
2024-10-23T13:52:55.015Z,173.21.3.15,8268152,Connection error?,Disconnection error,8268154,c
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
"@timestamp": {
"type": "date"
},
"cluster": {
"type": "keyword",
"time_series_dimension": true
},
"pod": {
"type": "keyword",
"time_series_dimension": true
},
"client": {
"properties": {
"ip": {
Expand Down Expand Up @@ -31,9 +39,6 @@
},
"network": {
"properties": {
"bytes_in": {
"type": "long"
},
"total_bytes_in": {
"type": "long",
"time_series_metric": "counter",
Expand All @@ -54,11 +59,6 @@
"up": {
"type": "boolean"
},
"tx": {
"type": "integer",
"time_series_metric": "gauge",
"meta": { "unit": "packets" }
},
"rx": {
"type": "integer",
"time_series_metric": "gauge",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"index": {
"mapping": {
"source": {
"mode": "synthetic"
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,23 @@ FROM partial_mapping_no_source_sample_data
2024-10-23T12:15:03.360Z | null
;

unmappedNumericFromSyntheticSourceSingleIndex
required_capability: optional_fields_v2

SET unmapped_fields="load"\;
FROM partial_mapping_synthetic_sample_data
| KEEP @timestamp, unmapped_event_duration
| SORT @timestamp DESC
| LIMIT 4
;

@timestamp:date | unmapped_event_duration:keyword
2024-10-23T13:55:01.543Z | 1756468
2024-10-23T13:53:55.832Z | 5033756
2024-10-23T13:52:55.015Z | 8268154
2024-10-23T13:51:54.732Z | 725449
;

statsByUnmappedFieldExistsInSource
required_capability: optional_fields_v2

Expand Down Expand Up @@ -249,6 +266,25 @@ FROM k8s_unmapped
2024-05-10T00:01:25.000Z | qa | null
;

unmappedNumericFromK8sSyntheticTsIndex
required_capability: optional_fields_v2
required_capability: ts_command_v0

SET unmapped_fields="load"\;
TS k8s_unmapped
| KEEP @timestamp, cluster, network.cost
| SORT @timestamp
| LIMIT 5
;

@timestamp:date | cluster:keyword | network.cost:keyword
2024-05-10T00:00:29.000Z | staging | 9.375
2024-05-10T00:00:33.000Z | staging | 1.25
2024-05-10T00:00:51.000Z | prod | 9.25
2024-05-10T00:00:57.000Z | prod | 12.125
2024-05-10T00:01:25.000Z | qa | 5.375
;

statsImplicitLastOverTimeUnmappedTsIndex
required_capability: optional_fields_v2
required_capability: ts_command_v0
Expand Down Expand Up @@ -281,15 +317,15 @@ TS k8s_unmapped

s:long | c:long | event_log:keyword
6 | 1 | Aenean hime
6 | 1 | Cubilia lac
6 | 1 | Lorem ipsum
6 | 1 | Lorem ipsum
6 | 1 | Nunc
6 | 1 | Nunc tortor
6 | 1 | Placerat mi
3 | 1 | Cubilia lac
2 | 1 | Lorem ipsum
3 | 1 | Lorem ipsum
2 | 1 | Nunc
4 | 1 | Nunc tortor
7 | 1 | Placerat mi
4 | 1 | Scelerisque
6 | 1 | Scelerisque
6 | 1 | Scelerisque
6 | 1 | Sed ultrici
1 | 1 | Sed ultrici
;

explicitLastOverTimeUnmappedTsIndex
Expand All @@ -305,11 +341,11 @@ TS k8s_unmapped
;

cluster:keyword | time_bucket:datetime | x:keyword
prod | 2024-05-10T00:00:00.000Z | vulputate su
prod | 2024-05-10T00:10:00.000Z | vulputate qu
prod | 2024-05-10T00:00:00.000Z | vivamus, soc
prod | 2024-05-10T00:10:00.000Z | vivamus, soc
prod | 2024-05-10T00:20:00.000Z | volutpat ves
qa | 2024-05-10T00:00:00.000Z | vulputate su
qa | 2024-05-10T00:10:00.000Z | vulputate qu
qa | 2024-05-10T00:00:00.000Z | vulputate qu
qa | 2024-05-10T00:10:00.000Z | volutpat hac
;

fieldIsNestedAndUnmapped
Expand Down Expand Up @@ -897,6 +933,56 @@ us | 2024-05-10T00:04:00.000Z | k8s
us | 2024-05-10T00:04:00.000Z | k8s_unmapped
;

// Copied over from max_over_time_with_filtering, except bytes_in is unmapped.
maxOverTimeWithFiltering
required_capability: optional_fields_v2
required_capability: ts_command_v0

SET unmapped_fields="load"\;
TS k8s_unmapped
| WHERE pod == "one"
| STATS tx = sum(max_over_time(network.bytes_in :: long)) BY cluster, time_bucket = bucket(@timestamp, 10minute)
| SORT time_bucket, cluster
| LIMIT 10
;

tx:long | cluster:keyword | time_bucket:datetime
970 | prod | 2024-05-10T00:00:00.000Z
842 | qa | 2024-05-10T00:00:00.000Z
753 | staging | 2024-05-10T00:00:00.000Z
990 | prod | 2024-05-10T00:10:00.000Z
1006 | qa | 2024-05-10T00:10:00.000Z
947 | staging | 2024-05-10T00:10:00.000Z
953 | prod | 2024-05-10T00:20:00.000Z
917 | qa | 2024-05-10T00:20:00.000Z
749 | staging | 2024-05-10T00:20:00.000Z
;

// Copied over from eval_on_max_over_time, except bytes_in is unmapped.
evalOnMaxOverTime
required_capability: optional_fields_v2
required_capability: ts_command_v0

SET unmapped_fields="load"\;
TS k8s_unmapped
| STATS max_bytes = avg(max_over_time(network.bytes_in :: long)) BY cluster, time_bucket = bucket(@timestamp, 10minute)
| EVAL kb_minus_offset = (max_bytes - 100) / 1000.0
| LIMIT 10
| SORT time_bucket, cluster
;

max_bytes:double | cluster:keyword | time_bucket:datetime | kb_minus_offset:double
909.3333333333334 | prod | 2024-05-10T00:00:00.000Z | 0.8093333333333333
908.6666666666666 | qa | 2024-05-10T00:00:00.000Z | 0.8086666666666666
794.0 | staging | 2024-05-10T00:00:00.000Z | 0.694
1005.0 | prod | 2024-05-10T00:10:00.000Z | 0.905
980.0 | qa | 2024-05-10T00:10:00.000Z | 0.88
917.6666666666666 | staging | 2024-05-10T00:10:00.000Z | 0.8176666666666667
846.3333333333334 | prod | 2024-05-10T00:20:00.000Z | 0.7463333333333334
941.6666666666666 | qa | 2024-05-10T00:20:00.000Z | 0.8416666666666667
786.0 | staging | 2024-05-10T00:20:00.000Z | 0.686
;

// https://github.com/elastic/elasticsearch/issues/143991
statsFilteredAggAfterEvalWithDottedUnmappedField
required_capability: optional_fields_v2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,23 @@ FROM partial_mapping_excluded_source_sample_data
2024-10-23T12:15:03.360Z | null
;

unmappedNumericFromSyntheticSourceSingleIndex
required_capability: optional_fields_nullify_tech_preview

SET unmapped_fields="nullify"\;
FROM partial_mapping_synthetic_sample_data
| KEEP @timestamp, unmapped_event_duration
| SORT @timestamp DESC
| LIMIT 4
;

@timestamp:date | unmapped_event_duration:null
2024-10-23T13:55:01.543Z | null
2024-10-23T13:53:55.832Z | null
2024-10-23T13:52:55.015Z | null
2024-10-23T13:51:54.732Z | null
;

fieldUnmappedInSourceButSourceDisabledSingleIndex
required_capability: optional_fields_nullify_tech_preview
required_capability: source_field_mapping
Expand Down Expand Up @@ -86,6 +103,25 @@ nanos:date_nanos
2023-01-23T13:55:01.543123456Z
;

unmappedNumericFromK8sSyntheticTsIndex
required_capability: optional_fields_nullify_tech_preview
required_capability: ts_command_v0

SET unmapped_fields="nullify"\;
TS k8s_unmapped
| KEEP @timestamp, cluster, network.cost
| SORT @timestamp
| LIMIT 5
;

@timestamp:datetime | cluster:keyword | network.cost:null
2024-05-10T00:00:29.000Z | staging | null
2024-05-10T00:00:33.000Z | staging | null
2024-05-10T00:00:51.000Z | prod | null
2024-05-10T00:00:57.000Z | prod | null
2024-05-10T00:01:25.000Z | qa | null
;

keepStar
required_capability: optional_fields_nullify_tech_preview

Expand Down Expand Up @@ -381,8 +417,8 @@ FROM languages
| STATS c = COUNT(*) BY does_not_exist
;

c:long |does_not_exist:null
4 |null
c:long | does_not_exist:null
4 | null
;

statsGroupAliasShadowingSourceColumnNoFilter
Expand All @@ -393,8 +429,8 @@ FROM languages
| STATS c = COUNT(*) BY language_code = does_not_exist
;

c:long |language_code:null
4 |null
c:long | language_code:null
4 | null
;

statsGroupAliasShadowingSourceColumnWithFilter
Expand All @@ -406,8 +442,8 @@ FROM languages
| STATS c = COUNT(*) BY language_code = does_not_exist, language_name
;

c:long |language_code:null |language_name :keyword
1 |null |English
c:long | language_code:null | language_name :keyword
1 | null | English
;

statsGroupAliasShadowingSourceColumnWithFilterAndAggExpression
Expand All @@ -420,8 +456,8 @@ FROM languages
BY language_code = does_not_exist1::INTEGER + does_not_exist2::INTEGER + language_code, language_name
;

c:long |language_code:integer |language_name :keyword
1 |null |English
c:long | language_code:integer | language_name :keyword
1 | null | English
;

inlinestatsSum
Expand Down Expand Up @@ -550,11 +586,11 @@ FROM sample_data, sample_data_str,
| SORT client_ip
;

client_ip:ip |foo:null |bar:keyword |baz:null
172.21.0.5 |null |null |null
172.21.2.113 |null |null |null
172.21.2.162 |null |null |null
172.21.3.15 |null |null |null
client_ip:ip | foo:null | bar:keyword | baz:null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes were unnecessary.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course not, but I used a script to fix the alignments for the tests I did add, and it just makes it easier to align the entire file rather than align a subset of it. I can revert it if you think it's a blocker.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely not a blocker.

172.21.0.5 | null | null | null
172.21.2.113 | null | null | null
172.21.2.162 | null | null | null
172.21.3.15 | null | null | null
;

forkBranchesWithDifferentSchemas
Expand Down Expand Up @@ -792,12 +828,12 @@ TS k8s_unmapped
| LIMIT 5
;

@timestamp:datetime | cluster:null | network.cost:null
2024-05-10T00:00:29.000Z | null | null
2024-05-10T00:00:33.000Z | null | null
2024-05-10T00:00:51.000Z | null | null
2024-05-10T00:00:57.000Z | null | null
2024-05-10T00:01:25.000Z | null | null
@timestamp:datetime | cluster:keyword | network.cost:null
2024-05-10T00:00:29.000Z | staging | null
2024-05-10T00:00:33.000Z | staging | null
2024-05-10T00:00:51.000Z | prod | null
2024-05-10T00:00:57.000Z | prod | null
2024-05-10T00:01:25.000Z | qa | null
;

rateOnUnmappedTsIndex
Expand Down Expand Up @@ -1005,7 +1041,7 @@ FROM employees
| EVAL y = coalesce(bar, baz)
;

avg_worked_seconds:long | birth_date:date | emp_no:integer | first_name:keyword | gender:keyword | height:double | height.float:double | height.half_float:double | height.scaled_float:double | hire_date:date | is_rehired:boolean | job_positions:keyword | languages:integer | languages.byte:integer | languages.long:long | languages.short:integer | last_name:keyword | salary:integer | salary_change:double | salary_change.int:integer | salary_change.keyword:keyword | salary_change.long:long | still_hired:boolean | foo:null | bar:null | baz:null | y:null
avg_worked_seconds:long | birth_date:date | emp_no:integer | first_name:keyword | gender:keyword | height:double | height.float:double | height.half_float:double | height.scaled_float:double | hire_date:date | is_rehired:boolean | job_positions:keyword | languages:integer | languages.byte:integer | languages.long:long | languages.short:integer | last_name:keyword | salary:integer | salary_change:double | salary_change.int:integer | salary_change.keyword:keyword | salary_change.long:long | still_hired:boolean | foo:null | bar:null | baz:null | y:null
;


Expand Down Expand Up @@ -1057,6 +1093,6 @@ ROW a = 12::long
| DROP a
;

_fork:keyword | x:long
fork1 | 12
_fork:keyword | x:long
fork1 | 12
;
Loading
Loading