From 117337e6d99df7e6e57341b1987297c7e83f7b88 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Thu, 17 Jul 2025 11:44:47 +1000
Subject: [PATCH 1/7] Update regions_by_endpoint for AWS sdk upgrade.
Also add test to ensure the file has at least one entry for each region
so that it is easy to spot missing regions in future upgrades.
Relates: #131050
---
.../repositories/s3/regions_by_endpoint.txt | 16 ++++++++++++++++
.../s3/RegionFromEndpointGuesserTests.java | 13 +++++++++++++
2 files changed, 29 insertions(+)
diff --git a/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt b/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt
index 3fae5c314c10b..5ca027a5f4a13 100644
--- a/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt
+++ b/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt
@@ -6,6 +6,10 @@ ap-east-1 s3-fips.ap-east-1.amazonaws.com
ap-east-1 s3-fips.dualstack.ap-east-1.amazonaws.com
ap-east-1 s3.ap-east-1.amazonaws.com
ap-east-1 s3.dualstack.ap-east-1.amazonaws.com
+ap-east-2 s3-fips.ap-east-2.amazonaws.com
+ap-east-2 s3-fips.dualstack.ap-east-2.amazonaws.com
+ap-east-2 s3.ap-east-2.amazonaws.com
+ap-east-2 s3.dualstack.ap-east-2.amazonaws.com
ap-northeast-1 s3-fips.ap-northeast-1.amazonaws.com
ap-northeast-1 s3-fips.dualstack.ap-northeast-1.amazonaws.com
ap-northeast-1 s3.ap-northeast-1.amazonaws.com
@@ -56,6 +60,14 @@ aws-iso-b-global s3-fips.aws-iso-b-global.sc2s.sgov.gov
aws-iso-b-global s3-fips.dualstack.aws-iso-b-global.sc2s.sgov.gov
aws-iso-b-global s3.aws-iso-b-global.sc2s.sgov.gov
aws-iso-b-global s3.dualstack.aws-iso-b-global.sc2s.sgov.gov
+aws-iso-e-global s3-fips.aws-iso-e-global.cloud.adc-e.uk
+aws-iso-e-global s3-fips.dualstack.aws-iso-e-global.cloud.adc-e.uk
+aws-iso-e-global s3.aws-iso-e-global.cloud.adc-e.uk
+aws-iso-e-global s3.dualstack.aws-iso-e-global.cloud.adc-e.uk
+aws-iso-f-global s3-fips.aws-iso-f-global.csp.hci.ic.gov
+aws-iso-f-global s3-fips.dualstack.aws-iso-f-global.csp.hci.ic.gov
+aws-iso-f-global s3.aws-iso-f-global.csp.hci.ic.gov
+aws-iso-f-global s3.dualstack.aws-iso-f-global.csp.hci.ic.gov
aws-iso-global s3-fips.aws-iso-global.c2s.ic.gov
aws-iso-global s3-fips.dualstack.aws-iso-global.c2s.ic.gov
aws-iso-global s3.aws-iso-global.c2s.ic.gov
@@ -76,6 +88,10 @@ cn-north-1 s3.cn-north-1.amazonaws.com.cn
cn-north-1 s3.dualstack.cn-north-1.amazonaws.com.cn
cn-northwest-1 s3.cn-northwest-1.amazonaws.com.cn
cn-northwest-1 s3.dualstack.cn-northwest-1.amazonaws.com.cn
+eusc-de-east-1 s3-fips.eusc-de-east-1.amazonaws.eu
+eusc-de-east-1 s3-fips.dualstack.eusc-de-east-1.amazonaws.eu
+eusc-de-east-1 s3.eusc-de-east-1.amazonaws.eu
+eusc-de-east-1 s3.dualstack.eusc-de-east-1.amazonaws.eu
eu-central-1 s3-fips.dualstack.eu-central-1.amazonaws.com
eu-central-1 s3-fips.eu-central-1.amazonaws.com
eu-central-1 s3.dualstack.eu-central-1.amazonaws.com
diff --git a/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java b/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java
index 9fe0c40c83979..402181878b600 100644
--- a/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java
+++ b/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java
@@ -9,6 +9,11 @@
package org.elasticsearch.repositories.s3;
+import software.amazon.awssdk.endpoints.Endpoint;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.endpoints.S3EndpointParams;
+import software.amazon.awssdk.services.s3.endpoints.internal.DefaultS3EndpointProvider;
+
import org.elasticsearch.core.Nullable;
import org.elasticsearch.test.ESTestCase;
@@ -23,6 +28,14 @@ public void testRegionGuessing() {
assertRegionGuess("random.endpoint.internal.net", null);
}
+ public void testHasEntryForEachRegion() {
+ final var defaultS3EndpointProvider = new DefaultS3EndpointProvider();
+ for (var region : Region.regions()) {
+ final Endpoint endpoint = safeGet(defaultS3EndpointProvider.resolveEndpoint(S3EndpointParams.builder().region(region).build()));
+ assertNotNull(region.id(), RegionFromEndpointGuesser.guessRegion(endpoint.url().toString()));
+ }
+ }
+
private static void assertRegionGuess(String endpoint, @Nullable String expectedRegion) {
assertEquals(endpoint, expectedRegion, RegionFromEndpointGuesser.guessRegion(endpoint));
}
From 1fad2c3e1bf08b8eabbc7db29e4b1650c1fe947f Mon Sep 17 00:00:00 2001
From: elasticsearchmachine
<58790826+elasticsearchmachine@users.noreply.github.com>
Date: Thu, 17 Jul 2025 09:43:49 +0200
Subject: [PATCH 2/7] Mute org.elasticsearch.packaging.test.DockerTests
test072RunEsAsDifferentUserAndGroup #131412
---
muted-tests.yml | 3 +++
1 file changed, 3 insertions(+)
diff --git a/muted-tests.yml b/muted-tests.yml
index 27ffc88873c70..6f920d95d4189 100644
--- a/muted-tests.yml
+++ b/muted-tests.yml
@@ -466,6 +466,9 @@ tests:
- class: org.elasticsearch.xpack.downsample.DataStreamLifecycleDownsampleDisruptionIT
method: testDataStreamLifecycleDownsampleRollingRestart
issue: https://github.com/elastic/elasticsearch/issues/131394
+- class: org.elasticsearch.packaging.test.DockerTests
+ method: test072RunEsAsDifferentUserAndGroup
+ issue: https://github.com/elastic/elasticsearch/issues/131412
# Examples:
#
From aadf40c156387793fd6bf3b0e8bd234d5d836d22 Mon Sep 17 00:00:00 2001
From: kanoshiou
Date: Thu, 17 Jul 2025 16:23:04 +0800
Subject: [PATCH 3/7] ESQL: Fix inconsistent column order in MV_EXPAND
(#129745)
The new attribute generated by MV_EXPAND should remain in the original position. The projection added by ProjectAwayColumns does not respect the original order of attributes.
Make ProjectAwayColumns respect the order of attributes to fix this.
---
docs/changelog/129745.yaml | 6 +
.../esql/core/expression/AttributeSet.java | 4 +
.../rest/generative/GenerativeRestTest.java | 1 -
.../src/main/resources/mv_expand.csv-spec | 62 +++
.../xpack/esql/action/EsqlCapabilities.java | 6 +
.../rules/physical/ProjectAwayColumns.java | 14 +-
.../optimizer/PhysicalPlanOptimizerTests.java | 500 +++++++++---------
7 files changed, 350 insertions(+), 243 deletions(-)
create mode 100644 docs/changelog/129745.yaml
diff --git a/docs/changelog/129745.yaml b/docs/changelog/129745.yaml
new file mode 100644
index 0000000000000..35cfd0671bd64
--- /dev/null
+++ b/docs/changelog/129745.yaml
@@ -0,0 +1,6 @@
+pr: 129745
+summary: "ESQL: Fix `mv_expand` inconsistent column order"
+area: ES|QL
+type: bug
+issues:
+ - 129000
diff --git a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java
index d281db4e6bf63..fefaf3098319e 100644
--- a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java
+++ b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java
@@ -261,5 +261,9 @@ public boolean isEmpty() {
public AttributeSet build() {
return new AttributeSet(mapBuilder.build());
}
+
+ public void clear() {
+ mapBuilder.keySet().clear();
+ }
}
}
diff --git a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java
index 2fefcccb4d14f..41b8658302bd7 100644
--- a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java
+++ b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java
@@ -53,7 +53,6 @@ public abstract class GenerativeRestTest extends ESRestTestCase {
"Data too large", // Circuit breaker exceptions eg. https://github.com/elastic/elasticsearch/issues/130072
// Awaiting fixes for correctness
- "Expecting the following columns \\[.*\\], got", // https://github.com/elastic/elasticsearch/issues/129000
"Expecting at most \\[.*\\] columns, got \\[.*\\]" // https://github.com/elastic/elasticsearch/issues/129561
);
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec
index 20ce3ecc5a396..c7dbe01ef6f09 100644
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec
@@ -419,3 +419,65 @@ emp_no:integer | job_positions:keyword
10001 | Accountant
10001 | Senior Python Developer
;
+
+testMvExpandInconsistentColumnOrder1
+required_capability: fix_mv_expand_inconsistent_column_order
+from message_types
+| eval foo_1 = 1, foo_2 = 2
+| sort message
+| mv_expand foo_1
+;
+
+message:keyword | type:keyword | foo_1:integer | foo_2:integer
+Connected to 10.1.0.1 | Success | 1 | 2
+Connected to 10.1.0.2 | Success | 1 | 2
+Connected to 10.1.0.3 | Success | 1 | 2
+Connection error | Error | 1 | 2
+Development environment | Development | 1 | 2
+Disconnected | Disconnected | 1 | 2
+Production environment | Production | 1 | 2
+;
+
+testMvExpandInconsistentColumnOrder2
+required_capability: fix_mv_expand_inconsistent_column_order
+from message_types
+| eval foo_1 = [1, 3], foo_2 = 2
+| sort message
+| mv_expand foo_1
+;
+
+message:keyword | type:keyword | foo_1:integer | foo_2:integer
+Connected to 10.1.0.1 | Success | 1 | 2
+Connected to 10.1.0.1 | Success | 3 | 2
+Connected to 10.1.0.2 | Success | 1 | 2
+Connected to 10.1.0.2 | Success | 3 | 2
+Connected to 10.1.0.3 | Success | 1 | 2
+Connected to 10.1.0.3 | Success | 3 | 2
+Connection error | Error | 1 | 2
+Connection error | Error | 3 | 2
+Development environment | Development | 1 | 2
+Development environment | Development | 3 | 2
+Disconnected | Disconnected | 1 | 2
+Disconnected | Disconnected | 3 | 2
+Production environment | Production | 1 | 2
+Production environment | Production | 3 | 2
+;
+
+testMvExpandInconsistentColumnOrder3
+required_capability: fix_mv_expand_inconsistent_column_order
+from message_types
+| sort type
+| eval language_code = 1, `language_name` = false, message = true, foo_3 = 1, foo_2 = null
+| eval foo_3 = "1", `foo_3` = -1, foo_1 = 1, `language_code` = null, `foo_2` = "1"
+| mv_expand foo_1
+| limit 5
+;
+
+type:keyword | language_name:boolean | message:boolean | foo_3:integer | foo_1:integer | language_code:null | foo_2:keyword
+Development | false | true | -1 | 1 | null | 1
+Disconnected | false | true | -1 | 1 | null | 1
+Error | false | true | -1 | 1 | null | 1
+Production | false | true | -1 | 1 | null | 1
+Success | false | true | -1 | 1 | null | 1
+;
+
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
index ac75811602bd6..f4f6bd8a12d79 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
@@ -1235,6 +1235,12 @@ public enum Cap {
*/
NO_PLAIN_STRINGS_IN_LITERALS,
+ /**
+ * Support for retaining the mv_expand target attribute in its original column position.
+ * See "ES|QL: inconsistent column order", https://github.com/elastic/elasticsearch/issues/129000
+ */
+ FIX_MV_EXPAND_INCONSISTENT_COLUMN_ORDER,
+
/**
* (Re)Added EXPLAIN command
*/
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java
index 26cfbf40eb7ff..887fb039a14cb 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java
@@ -76,7 +76,19 @@ public PhysicalPlan apply(PhysicalPlan plan) {
// no need for projection when dealing with aggs
if (logicalFragment instanceof Aggregate == false) {
- List output = new ArrayList<>(requiredAttrBuilder.build());
+ // preserve the original attribute order of the logical fragment's output
+ List output = new ArrayList<>();
+ for (Attribute attribute : logicalFragment.output()) {
+ if (requiredAttrBuilder.contains(attribute)) {
+ output.add(attribute);
+ requiredAttrBuilder.remove(attribute);
+ }
+ }
+ // requiredAttrBuilder should be empty unless the plan is inconsistent due to a bug.
+ // This can happen in case of remote ENRICH, see https://github.com/elastic/elasticsearch/issues/118531
+ // TODO: stop adding the remaining required attributes once remote ENRICH is fixed.
+ output.addAll(requiredAttrBuilder.build());
+
// if all the fields are filtered out, it's only the count that matters
// however until a proper fix (see https://github.com/elastic/elasticsearch/issues/98703)
// add a synthetic field (so it doesn't clash with the user defined one) to return a constant
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java
index 79d58341783aa..a609a1e494e54 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java
@@ -128,6 +128,7 @@
import org.elasticsearch.xpack.esql.plan.physical.LimitExec;
import org.elasticsearch.xpack.esql.plan.physical.LocalSourceExec;
import org.elasticsearch.xpack.esql.plan.physical.LookupJoinExec;
+import org.elasticsearch.xpack.esql.plan.physical.MvExpandExec;
import org.elasticsearch.xpack.esql.plan.physical.PhysicalPlan;
import org.elasticsearch.xpack.esql.plan.physical.ProjectExec;
import org.elasticsearch.xpack.esql.plan.physical.TopNExec;
@@ -193,6 +194,7 @@
import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.hamcrest.Matchers.containsInRelativeOrder;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
@@ -625,16 +627,16 @@ public void testTripleExtractorPerField() {
}
/**
- * Expected
- * LimitExec[10000[INTEGER]]
- * \_AggregateExec[[],[AVG(salary{f}#14) AS x],FINAL]
- * \_AggregateExec[[],[AVG(salary{f}#14) AS x],PARTIAL]
- * \_FilterExec[ROUND(emp_no{f}#9) > 10[INTEGER]]
- * \_TopNExec[[Order[last_name{f}#13,ASC,LAST]],10[INTEGER]]
- * \_ExchangeExec[]
- * \_ProjectExec[[salary{f}#14, first_name{f}#10, emp_no{f}#9, last_name{f}#13]] -- project away _doc
- * \_FieldExtractExec[salary{f}#14, first_name{f}#10, emp_no{f}#9, last_n..] -- local field extraction
- * \_EsQueryExec[test], query[][_doc{f}#16], limit[10], sort[[last_name]]
+ *LimitExec[10000[INTEGER],8]
+ * \_AggregateExec[[],[SUM(salary{f}#13460,true[BOOLEAN]) AS x#13454],FINAL,[$$x$sum{r}#13466, $$x$seen{r}#13467],8]
+ * \_AggregateExec[[],[SUM(salary{f}#13460,true[BOOLEAN]) AS x#13454],INITIAL,[$$x$sum{r}#13466, $$x$seen{r}#13467],8]
+ * \_FilterExec[ROUND(emp_no{f}#13455) > 10[INTEGER]]
+ * \_TopNExec[[Order[last_name{f}#13459,ASC,LAST]],10[INTEGER],58]
+ * \_ExchangeExec[[emp_no{f}#13455, last_name{f}#13459, salary{f}#13460],false]
+ * \_ProjectExec[[emp_no{f}#13455, last_name{f}#13459, salary{f}#13460]] -- project away _doc
+ * \_FieldExtractExec[emp_no{f}#13455, last_name{f}#13459, salary{f}#1346..] <[],[]> -- local field extraction
+ * \_EsQueryExec[test], indexMode[standard], query[][_doc{f}#13482], limit[10],
+ * sort[[FieldSort[field=last_name{f}#13459, direction=ASC, nulls=LAST]]] estimatedRowSize[74]
*/
public void testExtractorForField() {
var plan = physicalPlan("""
@@ -658,7 +660,7 @@ public void testExtractorForField() {
var exchange = asRemoteExchange(topN.child());
var project = as(exchange.child(), ProjectExec.class);
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("salary", "emp_no", "last_name"));
+ assertThat(names(extract.attributesToExtract()), contains("emp_no", "last_name", "salary"));
var source = source(extract.child());
assertThat(source.limit(), is(topN.limit()));
assertThat(source.sorts(), is(fieldSorts(topN.order())));
@@ -2219,7 +2221,7 @@ public void testNoPushDownChangeCase() {
* ages{f}#6, last_name{f}#7, long_noidx{f}#13, salary{f}#8],false]
* \_ProjectExec[[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gender{f}#5, hire_date{f}#10, job{f}#11, job.raw{f}#12, langu
* ages{f}#6, last_name{f}#7, long_noidx{f}#13, salary{f}#8]]
- * \_FieldExtractExec[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gen..]<[],[]>
+ * \_FieldExtractExec[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gen..]<[],[]>
* \_EsQueryExec[test], indexMode[standard], query[{"esql_single_value":{"field":"first_name","next":{"regexp":{"first_name":
* {"value":"foo*","flags_value":65791,"case_insensitive":true,"max_determinized_states":10000,"boost":0.0}}},
* "source":"TO_LOWER(first_name) RLIKE \"foo*\"@2:9"}}][_doc{f}#25], limit[1000], sort[] estimatedRowSize[332]
@@ -2340,10 +2342,10 @@ public void testPushDownUpperCaseChangeLike() {
* uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9],false]
* \_ProjectExec[[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, gender{f}#6, hire_date{f}#11, job{f}#12, job.raw{f}#13, lang
* uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9]]
- * \_FieldExtractExec[_meta_field{f}#10, gender{f}#6, hire_date{f}#11, jo..]<[],[]>
+ * \_FieldExtractExec[_meta_field{f}#10, gender{f}#6, hire_date{f}#11, jo..]<[],[]>
* \_LimitExec[1000[INTEGER]]
* \_FilterExec[LIKE(first_name{f}#5, "FOO*", true) OR IN(1[INTEGER],2[INTEGER],3[INTEGER],emp_no{f}#4 + 1[INTEGER])]
- * \_FieldExtractExec[first_name{f}#5, emp_no{f}#4]<[],[]>
+ * \_FieldExtractExec[first_name{f}#5, emp_no{f}#4]<[],[]>
* \_EsQueryExec[test], indexMode[standard], query[][_doc{f}#26], limit[], sort[] estimatedRowSize[332]
*/
public void testChangeCaseAsInsensitiveWildcardLikeNotPushedDown() {
@@ -2458,22 +2460,17 @@ public void testPushDownEvalFilter() {
/**
*
- * ProjectExec[[last_name{f}#21 AS name, first_name{f}#18 AS last_name, last_name{f}#21 AS first_name]]
- * \_TopNExec[[Order[last_name{f}#21,ASC,LAST]],10[INTEGER],0]
- * \_ExchangeExec[[last_name{f}#21, first_name{f}#18],false]
- * \_ProjectExec[[last_name{f}#21, first_name{f}#18]]
- * \_FieldExtractExec[last_name{f}#21, first_name{f}#18][]
- * \_EsQueryExec[test], indexMode[standard], query[{
- * "bool":{"must":[
- * {"esql_single_value":{
- * "field":"last_name",
- * "next":{"range":{"last_name":{"gt":"B","boost":1.0}}},
- * "source":"first_name > \"B\"@3:9"
- * }},
- * {"exists":{"field":"first_name","boost":1.0}}
- * ],"boost":1.0}}][_doc{f}#40], limit[10], sort[[
- * FieldSort[field=last_name{f}#21, direction=ASC, nulls=LAST]
- * ]] estimatedRowSize[116]
+ * ProjectExec[[last_name{f}#13858 AS name#13841, first_name{f}#13855 AS last_name#13844, last_name{f}#13858 AS first_name#13
+ * 847]]
+ * \_TopNExec[[Order[last_name{f}#13858,ASC,LAST]],10[INTEGER],100]
+ * \_ExchangeExec[[first_name{f}#13855, last_name{f}#13858],false]
+ * \_ProjectExec[[first_name{f}#13855, last_name{f}#13858]]
+ * \_FieldExtractExec[first_name{f}#13855, last_name{f}#13858]<[],[]>
+ * \_EsQueryExec[test], indexMode[standard], query[
+ * {"bool":{"must":[{"esql_single_value":{"field":"last_name","next":
+ * {"range":{"last_name":{"gt":"B","boost":0.0}}},"source":"first_name > \"B\"@3:9"}},
+ * {"exists":{"field":"first_name","boost":0.0}}],"boost":1.0}}
+ * ][_doc{f}#13879], limit[10], sort[[FieldSort[field=last_name{f}#13858, direction=ASC, nulls=LAST]]] estimatedRowSize[116]
*
*/
public void testPushDownEvalSwapFilter() {
@@ -2494,7 +2491,7 @@ public void testPushDownEvalSwapFilter() {
var extract = as(project.child(), FieldExtractExec.class);
assertThat(
extract.attributesToExtract().stream().map(Attribute::name).collect(Collectors.toList()),
- contains("last_name", "first_name")
+ contains("first_name", "last_name")
);
// Now verify the correct Lucene push-down of both the filter and the sort
@@ -2607,7 +2604,7 @@ public void testDissect() {
* uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9, _index{m}#2],false]
* \_ProjectExec[[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, gender{f}#6, hire_date{f}#11, job{f}#12, job.raw{f}#13, lang
* uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9, _index{m}#2]]
- * \_FieldExtractExec[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, ge..]<[],[]>
+ * \_FieldExtractExec[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, ge..]<[],[]>
* \_EsQueryExec[test], indexMode[standard], query[{"wildcard":{"_index":{"wildcard":"test*","boost":0.0}}}][_doc{f}#27],
* limit[1000], sort[] estimatedRowSize[382]
*
@@ -3176,6 +3173,56 @@ public void testProjectAwayAllColumnsWhenOnlyTheCountMattersInStats() {
assertThat(Expressions.names(esQuery.attrs()), contains("_doc"));
}
+ /**
+ * LimitExec[1000[INTEGER],336]
+ * \_MvExpandExec[foo_1{r}#4236,foo_1{r}#4253]
+ * \_TopNExec[[Order[emp_no{f}#4242,ASC,LAST]],1000[INTEGER],336]
+ * \_ExchangeExec[[_meta_field{f}#4248, emp_no{f}#4242, first_name{f}#4243, gender{f}#4244, hire_date{f}#4249, job{f}#4250, job.
+ * raw{f}#4251, languages{f}#4245, last_name{f}#4246, long_noidx{f}#4252, salary{f}#4247, foo_1{r}#4236, foo_2{r}#4238],
+ * false]
+ * \_ProjectExec[[_meta_field{f}#4248, emp_no{f}#4242, first_name{f}#4243, gender{f}#4244, hire_date{f}#4249, job{f}#4250, job.
+ * raw{f}#4251, languages{f}#4245, last_name{f}#4246, long_noidx{f}#4252, salary{f}#4247, foo_1{r}#4236, foo_2{r}#4238]]
+ * \_FieldExtractExec[_meta_field{f}#4248, emp_no{f}#4242, first_name{f}#..]<[],[]>
+ * \_EvalExec[[1[INTEGER] AS foo_1#4236, 1[INTEGER] AS foo_2#4238]]
+ * \_EsQueryExec[test], indexMode[standard], query[][_doc{f}#4268], limit[1000], sort[[FieldSort[field=emp_no{f}#4242,
+ * direction=ASC, nulls=LAST]]] estimatedRowSize[352]
+ */
+ public void testProjectAwayMvExpandColumnOrder() {
+ var plan = optimizedPlan(physicalPlan("""
+ from test
+ | eval foo_1 = 1, foo_2 = 1
+ | sort emp_no
+ | mv_expand foo_1
+ """));
+ var limit = as(plan, LimitExec.class);
+ var mvExpand = as(limit.child(), MvExpandExec.class);
+ var topN = as(mvExpand.child(), TopNExec.class);
+ var exchange = as(topN.child(), ExchangeExec.class);
+ var project = as(exchange.child(), ProjectExec.class);
+
+ assertThat(
+ Expressions.names(project.projections()),
+ containsInRelativeOrder(
+ "_meta_field",
+ "emp_no",
+ "first_name",
+ "gender",
+ "hire_date",
+ "job",
+ "job.raw",
+ "languages",
+ "last_name",
+ "long_noidx",
+ "salary",
+ "foo_1",
+ "foo_2"
+ )
+ );
+ var fieldExtract = as(project.child(), FieldExtractExec.class);
+ var eval = as(fieldExtract.child(), EvalExec.class);
+ EsQueryExec esQuery = as(eval.child(), EsQueryExec.class);
+ }
+
/**
* ProjectExec[[a{r}#5]]
* \_EvalExec[[__a_SUM@81823521{r}#15 / __a_COUNT@31645621{r}#16 AS a]]
@@ -5674,16 +5721,15 @@ public void testPushTopNWithFilterToSource() {
}
/**
- * ProjectExec[[abbrev{f}#12321, name{f}#12322, location{f}#12325, country{f}#12326, city{f}#12327]]
- * \_TopNExec[[Order[abbrev{f}#12321,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#12321, name{f}#12322, location{f}#12325, country{f}#12326, city{f}#12327],false]
- * \_ProjectExec[[abbrev{f}#12321, name{f}#12322, location{f}#12325, country{f}#12326, city{f}#12327]]
- * \_FieldExtractExec[abbrev{f}#12321, name{f}#12322, location{f}#12325, ..][]
+ * ProjectExec[[abbrev{f}#4474, name{f}#4475, location{f}#4478, country{f}#4479, city{f}#4480]]
+ * \_TopNExec[[Order[abbrev{f}#4474,ASC,LAST]],5[INTEGER],221]
+ * \_ExchangeExec[[abbrev{f}#4474, city{f}#4480, country{f}#4479, location{f}#4478, name{f}#4475],false]
+ * \_ProjectExec[[abbrev{f}#4474, city{f}#4480, country{f}#4479, location{f}#4478, name{f}#4475]]
+ * \_FieldExtractExec[abbrev{f}#4474, city{f}#4480, country{f}#4479, loca..]<[],[]>
* \_EsQueryExec[airports],
- * indexMode[standard],
- * query[][_doc{f}#12337],
- * limit[5],
- * sort[[FieldSort[field=abbrev{f}#12321, direction=ASC, nulls=LAST]]] estimatedRowSize[237]
+ * indexMode[standard],
+ * query[][_doc{f}#4490],
+ * limit[5], sort[[FieldSort[field=abbrev{f}#4474, direction=ASC, nulls=LAST]]] estimatedRowSize[237]
*/
public void testPushTopNKeywordToSource() {
var optimized = optimizedPlan(physicalPlan("""
@@ -5698,9 +5744,9 @@ public void testPushTopNKeywordToSource() {
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "location", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "location", "name"));
var source = source(extract.child());
assertThat(source.limit(), is(topN.limit()));
assertThat(source.sorts(), is(fieldSorts(topN.order())));
@@ -5716,13 +5762,13 @@ public void testPushTopNKeywordToSource() {
/**
*
- * ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18, abbrev{f}#12 AS code]]
- * \_TopNExec[[Order[abbrev{f}#12,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18],false]
- * \_ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18]]
- * \_FieldExtractExec[abbrev{f}#12, name{f}#13, location{f}#16, country{f..][]
- * \_EsQueryExec[airports], indexMode[standard], query[][_doc{f}#29], limit[5],
- * sort[[FieldSort[field=abbrev{f}#12, direction=ASC, nulls=LAST]]] estimatedRowSize[237]
+ * ProjectExec[[abbrev{f}#7828, name{f}#7829, location{f}#7832, country{f}#7833, city{f}#7834, abbrev{f}#7828 AS code#7820]]
+ * \_TopNExec[[Order[abbrev{f}#7828,ASC,LAST]],5[INTEGER],221]
+ * \_ExchangeExec[[abbrev{f}#7828, city{f}#7834, country{f}#7833, location{f}#7832, name{f}#7829],false]
+ * \_ProjectExec[[abbrev{f}#7828, city{f}#7834, country{f}#7833, location{f}#7832, name{f}#7829]]
+ * \_FieldExtractExec[abbrev{f}#7828, city{f}#7834, country{f}#7833, loca..]<[],[]>
+ * \_EsQueryExec[airports], indexMode[standard], query[][_doc{f}#7845], limit[5],
+ * sort[[FieldSort[field=abbrev{f}#7828, direction=ASC, nulls=LAST]]] estimatedRowSize[237]
*
*/
public void testPushTopNAliasedKeywordToSource() {
@@ -5740,9 +5786,9 @@ public void testPushTopNAliasedKeywordToSource() {
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "location", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "location", "name"));
var source = source(extract.child());
assertThat(source.limit(), is(topN.limit()));
assertThat(source.sorts(), is(fieldSorts(topN.order())));
@@ -5757,19 +5803,19 @@ public void testPushTopNAliasedKeywordToSource() {
}
/**
- * ProjectExec[[abbrev{f}#11, name{f}#12, location{f}#15, country{f}#16, city{f}#17]]
- * \_TopNExec[[Order[distance{r}#4,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#11, name{f}#12, location{f}#15, country{f}#16, city{f}#17, distance{r}#4],false]
- * \_ProjectExec[[abbrev{f}#11, name{f}#12, location{f}#15, country{f}#16, city{f}#17, distance{r}#4]]
- * \_FieldExtractExec[abbrev{f}#11, name{f}#12, country{f}#16, city{f}#17][]
- * \_EvalExec[[STDISTANCE(location{f}#15,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT])
- * AS distance]]
- * \_FieldExtractExec[location{f}#15][]
+ * ProjectExec[[abbrev{f}#7283, name{f}#7284, location{f}#7287, country{f}#7288, city{f}#7289]]
+ * \_TopNExec[[Order[distance{r}#7276,ASC,LAST]],5[INTEGER],229]
+ * \_ExchangeExec[[abbrev{f}#7283, city{f}#7289, country{f}#7288, location{f}#7287, name{f}#7284, distance{r}#7276],false]
+ * \_ProjectExec[[abbrev{f}#7283, city{f}#7289, country{f}#7288, location{f}#7287, name{f}#7284, distance{r}#7276]]
+ * \_FieldExtractExec[abbrev{f}#7283, city{f}#7289, country{f}#7288, name..]<[],[]>
+ * \_EvalExec[[STDISTANCE(location{f}#7287,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan
+ * ce#7276]]
+ * \_FieldExtractExec[location{f}#7287]<[],[]>
* \_EsQueryExec[airports],
- * indexMode[standard],
- * query[][_doc{f}#28],
- * limit[5],
- * sort[[GeoDistanceSort[field=location{f}#15, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245]
+ * indexMode[standard],
+ * query[][_doc{f}#7300],
+ * limit[5],
+ * sort[[GeoDistanceSort[field=location{f}#7287, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245]
*/
public void testPushTopNDistanceToSource() {
var optimized = optimizedPlan(physicalPlan("""
@@ -5785,9 +5831,9 @@ public void testPushTopNDistanceToSource() {
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "distance"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var evalExec = as(extract.child(), EvalExec.class);
var alias = as(evalExec.fields().get(0), Alias.class);
assertThat(alias.name(), is("distance"));
@@ -5814,20 +5860,19 @@ public void testPushTopNDistanceToSource() {
}
/**
- * ProjectExec[[abbrev{f}#8, name{f}#9, location{f}#12, country{f}#13, city{f}#14]]
- * \_TopNExec[[Order[$$order_by$0$0{r}#16,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#8, name{f}#9, location{f}#12, country{f}#13, city{f}#14, $$order_by$0$0{r}#16],false]
- * \_ProjectExec[[abbrev{f}#8, name{f}#9, location{f}#12, country{f}#13, city{f}#14, $$order_by$0$0{r}#16]]
- * \_FieldExtractExec[abbrev{f}#8, name{f}#9, country{f}#13, city{f}#14][]
- * \_EvalExec[[
- * STDISTANCE(location{f}#12,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS $$order_by$0$0
- * ]]
- * \_FieldExtractExec[location{f}#12][]
+ *ProjectExec[[abbrev{f}#5258, name{f}#5259, location{f}#5262, country{f}#5263, city{f}#5264]]
+ * \_TopNExec[[Order[$$order_by$0$0{r}#5266,ASC,LAST]],5[INTEGER],229]
+ * \_ExchangeExec[[abbrev{f}#5258, city{f}#5264, country{f}#5263, location{f}#5262, name{f}#5259, $$order_by$0$0{r}#5266],false]
+ * \_ProjectExec[[abbrev{f}#5258, city{f}#5264, country{f}#5263, location{f}#5262, name{f}#5259, $$order_by$0$0{r}#5266]]
+ * \_FieldExtractExec[abbrev{f}#5258, city{f}#5264, country{f}#5263, name..]<[],[]>
+ * \_EvalExec[[STDISTANCE(location{f}#5262,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS $$orde
+ * r_by$0$0#5266]]
+ * \_FieldExtractExec[location{f}#5262]<[],[]>
* \_EsQueryExec[airports],
- * indexMode[standard],
- * query[][_doc{f}#26],
- * limit[5],
- * sort[[GeoDistanceSort[field=location{f}#12, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245]
+ * indexMode[standard],
+ * query[][_doc{f}#5276],
+ * limit[5],
+ * sort[[GeoDistanceSort[field=location{f}#5262, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245]
*/
public void testPushTopNInlineDistanceToSource() {
var optimized = optimizedPlan(physicalPlan("""
@@ -5847,15 +5892,15 @@ public void testPushTopNInlineDistanceToSource() {
names(project.projections()),
contains(
equalTo("abbrev"),
- equalTo("name"),
- equalTo("location"),
- equalTo("country"),
equalTo("city"),
+ equalTo("country"),
+ equalTo("location"),
+ equalTo("name"),
startsWith("$$order_by$0$")
)
);
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var evalExec = as(extract.child(), EvalExec.class);
var alias = as(evalExec.fields().get(0), Alias.class);
assertThat(alias.name(), startsWith("$$order_by$0$"));
@@ -5884,14 +5929,14 @@ public void testPushTopNInlineDistanceToSource() {
/**
*
- * ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18]]
- * \_TopNExec[[Order[distance{r}#4,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18, distance{r}#4],false]
- * \_ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18, distance{r}#4]]
- * \_FieldExtractExec[abbrev{f}#12, name{f}#13, country{f}#17, city{f}#18][]
- * \_EvalExec[[STDISTANCE(location{f}#16,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance
- * ]]
- * \_FieldExtractExec[location{f}#16][]
+ * ProjectExec[[abbrev{f}#361, name{f}#362, location{f}#365, country{f}#366, city{f}#367]]
+ * \_TopNExec[[Order[distance{r}#353,ASC,LAST]],5[INTEGER],229]
+ * \_ExchangeExec[[abbrev{f}#361, city{f}#367, country{f}#366, location{f}#365, name{f}#362, distance{r}#353],false]
+ * \_ProjectExec[[abbrev{f}#361, city{f}#367, country{f}#366, location{f}#365, name{f}#362, distance{r}#353]]
+ * \_FieldExtractExec[abbrev{f}#361, city{f}#367, country{f}#366, name{f}..]<[],[]>
+ * \_EvalExec[[STDISTANCE(location{f}#365,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distanc
+ * e#353]]
+ * \_FieldExtractExec[location{f}#365]<[],[]>
* \_EsQueryExec[airports], indexMode[standard], query[
* {
* "geo_shape":{
@@ -5904,7 +5949,7 @@ public void testPushTopNInlineDistanceToSource() {
* }
* }
* }
- * }][_doc{f}#29], limit[5], sort[[GeoDistanceSort[field=location{f}#16, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245]
+ * ][_doc{f}#378], limit[5], sort[[GeoDistanceSort[field=location{f}#365, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245]
*
*/
public void testPushTopNDistanceWithFilterToSource() {
@@ -5922,9 +5967,9 @@ public void testPushTopNDistanceWithFilterToSource() {
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "distance"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var evalExec = as(extract.child(), EvalExec.class);
var alias = as(evalExec.fields().get(0), Alias.class);
assertThat(alias.name(), is("distance"));
@@ -5960,48 +6005,25 @@ public void testPushTopNDistanceWithFilterToSource() {
/**
*
- * ProjectExec[[abbrev{f}#14, name{f}#15, location{f}#18, country{f}#19, city{f}#20]]
- * \_TopNExec[[Order[distance{r}#4,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#14, name{f}#15, location{f}#18, country{f}#19, city{f}#20, distance{r}#4],false]
- * \_ProjectExec[[abbrev{f}#14, name{f}#15, location{f}#18, country{f}#19, city{f}#20, distance{r}#4]]
- * \_FieldExtractExec[abbrev{f}#14, name{f}#15, country{f}#19, city{f}#20][]
- * \_EvalExec[[STDISTANCE(location{f}#18,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT])
- * AS distance]]
- * \_FieldExtractExec[location{f}#18][]
- * \_EsQueryExec[airports], indexMode[standard], query[{
- * "bool":{
- * "filter":[
- * {
- * "esql_single_value":{
- * "field":"scalerank",
- * "next":{"range":{"scalerank":{"lt":6,"boost":1.0}}},
- * "source":"scalerank lt 6@3:31"
- * }
- * },
- * {
- * "bool":{
- * "must":[
- * {"geo_shape":{
- * "location":{
- * "relation":"INTERSECTS",
- * "shape":{"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}
- * }
- * }},
- * {"geo_shape":{
- * "location":{
- * "relation":"DISJOINT",
- * "shape":{"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}
- * }
- * }}
- * ],
- * "boost":1.0
- * }
- * }
- * ],
- * "boost":1.0
- * }}][_doc{f}#31], limit[5], sort[[
- * GeoDistanceSort[field=location{f}#18, direction=ASC, lat=55.673, lon=12.565]
- * ]] estimatedRowSize[245]
+ * ProjectExec[[abbrev{f}#6367, name{f}#6368, location{f}#6371, country{f}#6372, city{f}#6373]]
+ * \_TopNExec[[Order[distance{r}#6357,ASC,LAST]],5[INTEGER],229]
+ * \_ExchangeExec[[abbrev{f}#6367, city{f}#6373, country{f}#6372, location{f}#6371, name{f}#6368, distance{r}#6357],false]
+ * \_ProjectExec[[abbrev{f}#6367, city{f}#6373, country{f}#6372, location{f}#6371, name{f}#6368, distance{r}#6357]]
+ * \_FieldExtractExec[abbrev{f}#6367, city{f}#6373, country{f}#6372, name..]<[],[]>
+ * \_EvalExec[[STDISTANCE(location{f}#6371,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan
+ * ce#6357]]
+ * \_FieldExtractExec[location{f}#6371]<[],[]>
+ * \_EsQueryExec[airports], indexMode[standard], query[
+ * {"bool":{"filter":[{"esql_single_value":{"field":"scalerank","next":{"range":
+ * {"scalerank":{"lt":6,"boost":0.0}}},"source":"scalerank < 6@3:31"}},
+ * {"bool":{"must":[{"geo_shape":
+ * {"location":{"relation":"INTERSECTS","shape":
+ * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}},
+ * {"geo_shape":{"location":{"relation":"DISJOINT","shape":
+ * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}}]
+ * ,"boost":1.0}}],"boost":1.0}}
+ * ][_doc{f}#6384], limit[5], sort[
+ * [GeoDistanceSort[field=location{f}#6371, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245]
*
*/
public void testPushTopNDistanceWithCompoundFilterToSource() {
@@ -6019,9 +6041,9 @@ public void testPushTopNDistanceWithCompoundFilterToSource() {
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "distance"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var evalExec = as(extract.child(), EvalExec.class);
var alias = as(evalExec.fields().get(0), Alias.class);
assertThat(alias.name(), is("distance"));
@@ -6059,35 +6081,28 @@ public void testPushTopNDistanceWithCompoundFilterToSource() {
/**
* Tests that multiple sorts, including distance and a field, are pushed down to the source.
*
- * ProjectExec[[abbrev{f}#25, name{f}#26, location{f}#29, country{f}#30, city{f}#31, scalerank{f}#27, scale{r}#7]]
- * \_TopNExec[[
- * Order[distance{r}#4,ASC,LAST],
- * Order[scalerank{f}#27,ASC,LAST],
- * Order[scale{r}#7,DESC,FIRST],
- * Order[loc{r}#10,DESC,FIRST]
- * ],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#25, name{f}#26, location{f}#29, country{f}#30, city{f}#31, scalerank{f}#27, scale{r}#7,
- * distance{r}#4, loc{r}#10],false]
- * \_ProjectExec[[abbrev{f}#25, name{f}#26, location{f}#29, country{f}#30, city{f}#31, scalerank{f}#27, scale{r}#7,
- * distance{r}#4, loc{r}#10]]
- * \_FieldExtractExec[abbrev{f}#25, name{f}#26, country{f}#30, city{f}#31][]
- * \_EvalExec[[
- * STDISTANCE(location{f}#29,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance,
- * 10[INTEGER] - scalerank{f}#27 AS scale, TOSTRING(location{f}#29) AS loc
- * ]]
- * \_FieldExtractExec[location{f}#29, scalerank{f}#27][]
- * \_EsQueryExec[airports], indexMode[standard], query[{
- * "bool":{
- * "filter":[
- * {"esql_single_value":{"field":"scalerank","next":{...},"source":"scalerank < 6@3:31"}},
- * {"bool":{
- * "must":[
- * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}},
- * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}}
- * ],"boost":1.0}}],"boost":1.0}}][_doc{f}#44], limit[5], sort[[
- * GeoDistanceSort[field=location{f}#29, direction=ASC, lat=55.673, lon=12.565],
- * FieldSort[field=scalerank{f}#27, direction=ASC, nulls=LAST]
- * ]] estimatedRowSize[303]
+ * ProjectExec[[abbrev{f}#7429, name{f}#7430, location{f}#7433, country{f}#7434, city{f}#7435, scalerank{f}#7431, scale{r}#74
+ * 11]]
+ * \_TopNExec[[Order[distance{r}#7408,ASC,LAST], Order[scalerank{f}#7431,ASC,LAST], Order[scale{r}#7411,DESC,FIRST], Order[l
+ * oc{r}#7414,DESC,FIRST]],5[INTEGER],287]
+ * \_ExchangeExec[[abbrev{f}#7429, city{f}#7435, country{f}#7434, location{f}#7433, name{f}#7430, scalerank{f}#7431, distance{r}
+ * #7408, scale{r}#7411, loc{r}#7414],false]
+ * \_ProjectExec[[abbrev{f}#7429, city{f}#7435, country{f}#7434, location{f}#7433, name{f}#7430, scalerank{f}#7431, distance{r}
+ * #7408, scale{r}#7411, loc{r}#7414]]
+ * \_FieldExtractExec[abbrev{f}#7429, city{f}#7435, country{f}#7434, name..]<[],[]>
+ * \_EvalExec[[STDISTANCE(location{f}#7433,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan
+ * ce#7408, 10[INTEGER] - scalerank{f}#7431 AS scale#7411, TOSTRING(location{f}#7433) AS loc#7414]]
+ * \_FieldExtractExec[location{f}#7433, scalerank{f}#7431]<[],[]>
+ * \_EsQueryExec[airports], indexMode[standard], query[
+ * {"bool":{"filter":[{"esql_single_value":{"field":"scalerank","next":
+ * {"range":{"scalerank":{"lt":6,"boost":0.0}}},"source":"scalerank < 6@3:31"}},
+ * {"bool":{"must":[{"geo_shape":{"location":{"relation":"INTERSECTS","shape":
+ * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}},
+ * {"geo_shape":{"location":{"relation":"DISJOINT","shape":
+ * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}}],
+ * "boost":1.0}}],"boost":1.0}}][_doc{f}#7448], limit[5], sort[
+ * [GeoDistanceSort[field=location{f}#7433, direction=ASC, lat=55.673, lon=12.565],
+ * FieldSort[field=scalerank{f}#7431, direction=ASC, nulls=LAST]]] estimatedRowSize[303]
*
*/
public void testPushTopNDistanceAndPushableFieldWithCompoundFilterToSource() {
@@ -6108,10 +6123,10 @@ public void testPushTopNDistanceAndPushableFieldWithCompoundFilterToSource() {
project = as(exchange.child(), ProjectExec.class);
assertThat(
names(project.projections()),
- contains("abbrev", "name", "location", "country", "city", "scalerank", "scale", "distance", "loc")
+ contains("abbrev", "city", "country", "location", "name", "scalerank", "distance", "scale", "loc")
);
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var evalExec = as(extract.child(), EvalExec.class);
var alias = as(evalExec.fields().get(0), Alias.class);
assertThat(alias.name(), is("distance"));
@@ -6153,26 +6168,30 @@ public void testPushTopNDistanceAndPushableFieldWithCompoundFilterToSource() {
/**
* This test shows that if the filter contains a predicate on the same field that is sorted, we cannot push down the sort.
*
- * ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scalerank{f}#25 AS scale]]
- * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scalerank{f}#25,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scalerank{f}#25, distance{r}#4],false]
- * \_ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scalerank{f}#25, distance{r}#4]]
- * \_FieldExtractExec[abbrev{f}#23, name{f}#24, country{f}#28, city{f}#29][]
- * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scalerank{f}#25,ASC,LAST]],5[INTEGER],208]
- * \_FieldExtractExec[scalerank{f}#25][]
- * \_FilterExec[SUBSTRING(position{r}#7,1[INTEGER],5[INTEGER]) == [50 4f 49 4e 54][KEYWORD]]
- * \_EvalExec[[
- * STDISTANCE(location{f}#27,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance,
- * TOSTRING(location{f}#27) AS position
- * ]]
- * \_FieldExtractExec[location{f}#27][]
- * \_EsQueryExec[airports], indexMode[standard], query[{
- * "bool":{"filter":[
- * {"esql_single_value":{"field":"scalerank","next":{"range":{"scalerank":{"lt":6,"boost":1.0}}},"source":...}},
- * {"bool":{"must":[
- * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}},
- * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}}
- * ],"boost":1.0}}],"boost":1.0}}][_doc{f}#42], limit[], sort[] estimatedRowSize[87]
+ * ProjectExec[[abbrev{f}#4856, name{f}#4857, location{f}#4860, country{f}#4861, city{f}#4862, scalerank{f}#4858 AS scale#484
+ * 3]]
+ * \_TopNExec[[Order[distance{r}#4837,ASC,LAST], Order[scalerank{f}#4858,ASC,LAST]],5[INTEGER],233]
+ * \_ExchangeExec[[abbrev{f}#4856, city{f}#4862, country{f}#4861, location{f}#4860, name{f}#4857, scalerank{f}#4858, distance{r}
+ * #4837],false]
+ * \_ProjectExec[[abbrev{f}#4856, city{f}#4862, country{f}#4861, location{f}#4860, name{f}#4857, scalerank{f}#4858, distance{r}
+ * #4837]]
+ * \_FieldExtractExec[abbrev{f}#4856, city{f}#4862, country{f}#4861, name..]<[],[]>
+ * \_TopNExec[[Order[distance{r}#4837,ASC,LAST], Order[scalerank{f}#4858,ASC,LAST]],5[INTEGER],303]
+ * \_FieldExtractExec[scalerank{f}#4858]<[],[]>
+ * \_FilterExec[SUBSTRING(position{r}#4840,1[INTEGER],5[INTEGER]) == POINT[KEYWORD]]
+ * \_EvalExec[[STDISTANCE(location{f}#4860,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS
+ * distance#4837, TOSTRING(location{f}#4860) AS position#4840]]
+ * \_FieldExtractExec[location{f}#4860]<[],[]>
+ * \_EsQueryExec[airports], indexMode[standard], query[
+ * {"bool":{"filter":[
+ * {"esql_single_value":
+ * {"field":"scalerank","next":{"range":{"scalerank":{"lt":6,"boost":0.0}}},"source":"scale < 6@3:93"}},
+ * {"bool":{"must":[
+ * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":
+ * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}},
+ * {"geo_shape":{"location":{"relation":"DISJOINT","shape":
+ * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}}
+ * ],"boost":1.0}}],"boost":1.0}}][_doc{f}#4875], limit[], sort[] estimatedRowSize[87]
*
*/
public void testPushTopNDistanceAndNonPushableEvalWithCompoundFilterToSource() {
@@ -6191,9 +6210,9 @@ public void testPushTopNDistanceAndNonPushableEvalWithCompoundFilterToSource() {
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "scalerank", "distance"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var topNChild = as(extract.child(), TopNExec.class);
extract = as(topNChild.child(), FieldExtractExec.class);
assertThat(names(extract.attributesToExtract()), contains("scalerank"));
@@ -6228,27 +6247,25 @@ public void testPushTopNDistanceAndNonPushableEvalWithCompoundFilterToSource() {
/**
* This test shows that if the filter contains a predicate on the same field that is sorted, we cannot push down the sort.
*
- * ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scale{r}#10]]
- * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scale{r}#10,ASC,LAST]],5[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scale{r}#10, distance{r}#4],false]
- * \_ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scale{r}#10, distance{r}#4]]
- * \_FieldExtractExec[abbrev{f}#23, name{f}#24, country{f}#28, city{f}#29][]
- * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scale{r}#10,ASC,LAST]],5[INTEGER],208]
- * \_FilterExec[
- * SUBSTRING(position{r}#7,1[INTEGER],5[INTEGER]) == [50 4f 49 4e 54][KEYWORD]
- * AND scale{r}#10 > 3[INTEGER]
- * ]
- * \_EvalExec[[
- * STDISTANCE(location{f}#27,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance,
- * TOSTRING(location{f}#27) AS position,
- * 10[INTEGER] - scalerank{f}#25 AS scale
- * ]]
- * \_FieldExtractExec[location{f}#27, scalerank{f}#25][]
- * \_EsQueryExec[airports], indexMode[standard], query[{
- * "bool":{"must":[
- * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}},
- * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}}
- * ],"boost":1.0}}][_doc{f}#42], limit[], sort[] estimatedRowSize[91]
+ *ProjectExec[[abbrev{f}#1447, name{f}#1448, location{f}#1451, country{f}#1452, city{f}#1453, scalerank{r}#1434]]
+ * \_TopNExec[[Order[distance{r}#1428,ASC,LAST], Order[scalerank{r}#1434,ASC,LAST]],5[INTEGER],233]
+ * \_ExchangeExec[[abbrev{f}#1447, city{f}#1453, country{f}#1452, location{f}#1451, name{f}#1448, distance{r}#1428, scalerank{r}
+ * #1434],false]
+ * \_ProjectExec[[abbrev{f}#1447, city{f}#1453, country{f}#1452, location{f}#1451, name{f}#1448, distance{r}#1428, scalerank{r}
+ * #1434]]
+ * \_FieldExtractExec[abbrev{f}#1447, city{f}#1453, country{f}#1452, name..]<[],[]>
+ * \_TopNExec[[Order[distance{r}#1428,ASC,LAST], Order[scalerank{r}#1434,ASC,LAST]],5[INTEGER],303]
+ * \_FilterExec[SUBSTRING(position{r}#1431,1[INTEGER],5[INTEGER]) == POINT[KEYWORD] AND scalerank{r}#1434 > 3[INTEGER]]
+ * \_EvalExec[[STDISTANCE(location{f}#1451,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan
+ * ce#1428, TOSTRING(location{f}#1451) AS position#1431, 10[INTEGER] - scalerank{f}#1449 AS scalerank#1434]]
+ * \_FieldExtractExec[location{f}#1451, scalerank{f}#1449]<[],[]>
+ * \_EsQueryExec[airports], indexMode[standard], query[
+ * {"bool":{"must":[
+ * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":
+ * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}},
+ * {"geo_shape":{"location":{"relation":"DISJOINT","shape":
+ * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}}
+ * ],"boost":1.0}}][_doc{f}#1466], limit[], sort[] estimatedRowSize[91]
*
*/
public void testPushTopNDistanceAndNonPushableEvalsWithCompoundFilterToSource() {
@@ -6267,9 +6284,9 @@ public void testPushTopNDistanceAndNonPushableEvalsWithCompoundFilterToSource()
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance", "scalerank"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var topNChild = as(extract.child(), TopNExec.class);
var filter = as(topNChild.child(), FilterExec.class);
assertThat(filter.condition(), isA(And.class));
@@ -6344,9 +6361,9 @@ public void testPushTopNDistanceWithCompoundFilterToSourceAndDisjunctiveNonPusha
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "scalerank", "distance"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name"));
var topNChild = as(extract.child(), TopNExec.class);
var filter = as(topNChild.child(), FilterExec.class);
assertThat(filter.condition(), isA(Or.class));
@@ -6373,28 +6390,29 @@ public void testPushTopNDistanceWithCompoundFilterToSourceAndDisjunctiveNonPusha
/**
*
- * ProjectExec[[abbrev{f}#15, name{f}#16, location{f}#19, country{f}#20, city{f}#21]]
- * \_TopNExec[[Order[scalerank{f}#17,ASC,LAST], Order[distance{r}#4,ASC,LAST]],15[INTEGER],0]
- * \_ExchangeExec[[abbrev{f}#15, name{f}#16, location{f}#19, country{f}#20, city{f}#21, scalerank{f}#17, distance{r}#4],false]
- * \_ProjectExec[[abbrev{f}#15, name{f}#16, location{f}#19, country{f}#20, city{f}#21, scalerank{f}#17, distance{r}#4]]
- * \_FieldExtractExec[abbrev{f}#15, name{f}#16, country{f}#20, city{f}#21, ..][]
- * \_EvalExec[[STDISTANCE(location{f}#19,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT])
- * AS distance]]
- * \_FieldExtractExec[location{f}#19][]
- * \_EsQueryExec[airports], indexMode[standard], query[{
- * "bool":{
- * "filter":[
- * {"esql_single_value":{"field":"scalerank",...,"source":"scalerank lt 6@3:31"}},
- * {"bool":{"must":[
- * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}},
- * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}}
- * ],"boost":1.0}}
- * ],"boost":1.0
- * }
- * }][_doc{f}#32], limit[], sort[[
- * FieldSort[field=scalerank{f}#17, direction=ASC, nulls=LAST],
- * GeoDistanceSort[field=location{f}#19, direction=ASC, lat=55.673, lon=12.565]
- * ]] estimatedRowSize[37]
+ * ProjectExec[[abbrev{f}#6090, name{f}#6091, location{f}#6094, country{f}#6095, city{f}#6096]]
+ * \_TopNExec[[Order[scalerank{f}#6092,ASC,LAST], Order[distance{r}#6079,ASC,LAST]],15[INTEGER],233]
+ * \_ExchangeExec[[abbrev{f}#6090, city{f}#6096, country{f}#6095, location{f}#6094, name{f}#6091, scalerank{f}#6092, distance{r}
+ * #6079],false]
+ * \_ProjectExec[[abbrev{f}#6090, city{f}#6096, country{f}#6095, location{f}#6094, name{f}#6091, scalerank{f}#6092, distance{r}
+ * #6079]]
+ * \_FieldExtractExec[abbrev{f}#6090, city{f}#6096, country{f}#6095, name..]<[],[]>
+ * \_EvalExec[[STDISTANCE(location{f}#6094,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan
+ * ce#6079]]
+ * \_FieldExtractExec[location{f}#6094]<[],[]>
+ * \_EsQueryExec[airports], indexMode[standard], query[
+ * {"bool":{"filter":[
+ * {"esql_single_value":{"field":"scalerank","next":{"range":
+ * {"scalerank":{"lt":6,"boost":0.0}}},"source":"scalerank < 6@3:31"}},
+ * {"bool":{"must":[
+ * {"geo_shape": {"location":{"relation":"INTERSECTS","shape":
+ * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}},
+ * {"geo_shape":{"location":{"relation":"DISJOINT","shape":
+ * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}}
+ * ],"boost":1.0}}],"boost":1.0}}
+ * ][_doc{f}#6107], limit[15], sort[
+ * [FieldSort[field=scalerank{f}#6092, direction=ASC, nulls=LAST],
+ * GeoDistanceSort[field=location{f}#6094, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[249]
*
*/
public void testPushCompoundTopNDistanceWithCompoundFilterToSource() {
@@ -6413,9 +6431,9 @@ public void testPushCompoundTopNDistanceWithCompoundFilterToSource() {
var exchange = asRemoteExchange(topN.child());
project = as(exchange.child(), ProjectExec.class);
- assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance"));
+ assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "scalerank", "distance"));
var extract = as(project.child(), FieldExtractExec.class);
- assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city", "scalerank"));
+ assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name", "scalerank"));
var evalExec = as(extract.child(), EvalExec.class);
var alias = as(evalExec.fields().get(0), Alias.class);
assertThat(alias.name(), is("distance"));
@@ -8053,7 +8071,7 @@ public void testNotEqualsPushdownToDelegate() {
* ges{f}#5, last_name{f}#6, long_noidx{f}#12, salary{f}#7],false]
* \_ProjectExec[[_meta_field{f}#8, emp_no{f}#2, first_name{f}#3, gender{f}#4, hire_date{f}#9, job{f}#10, job.raw{f}#11, langua
* ges{f}#5, last_name{f}#6, long_noidx{f}#12, salary{f}#7]]
- * \_FieldExtractExec[_meta_field{f}#8, emp_no{f}#2, first_name{f}#3, gen..]<[],[]>
+ * \_FieldExtractExec[_meta_field{f}#8, emp_no{f}#2, first_name{f}#3, gen..]<[],[]>
* \_EsQueryExec[test], indexMode[standard],
* query[{"bool":{"filter":[{"sampling":{"probability":0.1,"seed":234,"hash":0}}],"boost":1.0}}]
* [_doc{f}#24], limit[1000], sort[] estimatedRowSize[332]
From e3cf9bbd2cd17fa72acdc23cb42007d22929c541 Mon Sep 17 00:00:00 2001
From: Jan Kuipers <148754765+jan-elastic@users.noreply.github.com>
Date: Thu, 17 Jul 2025 10:24:30 +0200
Subject: [PATCH 4/7] ES|QL categorize options (#131104)
* ES|QL categorize options
* refactor options
* fix serialization
* polish
* add verfications
* better test coverage + polish code
* better test coverage + polish code
---
.../compute/operator/AggregatorBenchmark.java | 2 +-
.../functionNamedParams/categorize.md | 13 ++
.../_snippets/functions/layout/categorize.md | 3 +
.../functions/parameters/categorize.md | 3 +
.../_snippets/functions/types/categorize.md | 8 +-
.../esql/images/functions/categorize.svg | 2 +-
.../org/elasticsearch/TransportVersions.java | 1 +
.../aggregation/blockhash/BlockHash.java | 30 +++--
.../blockhash/CategorizeBlockHash.java | 39 ++++--
.../CategorizePackedValuesBlockHash.java | 10 +-
.../blockhash/CategorizeBlockHashTests.java | 110 ++++++++++------
.../CategorizePackedValuesBlockHashTests.java | 19 ++-
.../blockhash/TopNBlockHashTests.java | 2 +-
.../HashAggregationOperatorTests.java | 4 +-
.../src/main/resources/categorize.csv-spec | 81 +++++++++++-
.../xpack/esql/action/EsqlCapabilities.java | 11 +-
.../esql/expression/function/Options.java | 107 ++++++++++++++++
.../function/fulltext/FullTextFunction.java | 71 -----------
.../expression/function/fulltext/Match.java | 12 +-
.../function/fulltext/MatchPhrase.java | 10 +-
.../function/fulltext/MultiMatch.java | 3 +-
.../function/fulltext/QueryString.java | 10 +-
.../function/grouping/Categorize.java | 119 ++++++++++++++++--
.../esql/expression/function/vector/Knn.java | 38 +-----
...laceAggregateNestedExpressionWithEval.java | 7 +-
.../AbstractPhysicalOperationProviders.java | 8 +-
.../xpack/esql/analysis/VerifierTests.java | 51 ++++++++
.../grouping/CategorizeErrorTests.java | 2 +-
.../function/grouping/CategorizeTests.java | 2 +-
.../rules/logical/FoldNullTests.java | 2 +-
.../SerializableTokenListCategory.java | 7 ++
31 files changed, 572 insertions(+), 215 deletions(-)
create mode 100644 docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md
create mode 100644 x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java
diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
index d144d7601349d..d5fe1b4a697e0 100644
--- a/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
+++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
@@ -191,7 +191,7 @@ private static Operator operator(DriverContext driverContext, String grouping, S
new BlockHash.GroupSpec(2, ElementType.BYTES_REF)
);
case TOP_N_LONGS -> List.of(
- new BlockHash.GroupSpec(0, ElementType.LONG, false, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT))
+ new BlockHash.GroupSpec(0, ElementType.LONG, null, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT))
);
default -> throw new IllegalArgumentException("unsupported grouping [" + grouping + "]");
};
diff --git a/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md
new file mode 100644
index 0000000000000..acd2064002b44
--- /dev/null
+++ b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md
@@ -0,0 +1,13 @@
+% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it.
+
+**Supported function named parameters**
+
+`output_format`
+: (boolean) The output format of the categories. Defaults to regex.
+
+`similarity_threshold`
+: (boolean) The minimum percentage of token weight that must match for text to be added to the category bucket. Must be between 1 and 100. The larger the value the narrower the categories. Larger values will increase memory usage and create narrower categories. Defaults to 70.
+
+`analyzer`
+: (keyword) Analyzer used to convert the field into tokens for text categorization.
+
diff --git a/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md
index ca23c1e2efc23..2e331187665f4 100644
--- a/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md
+++ b/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md
@@ -19,5 +19,8 @@
:::{include} ../types/categorize.md
:::
+:::{include} ../functionNamedParams/categorize.md
+:::
+
:::{include} ../examples/categorize.md
:::
diff --git a/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md
index 8733908754570..c013b67375a3d 100644
--- a/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md
+++ b/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md
@@ -5,3 +5,6 @@
`field`
: Expression to categorize
+`options`
+: (Optional) Categorize additional options as [function named parameters](/reference/query-languages/esql/esql-syntax.md#esql-function-named-params).
+
diff --git a/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md
index 6043fbe719ff8..8ebe22b61286c 100644
--- a/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md
+++ b/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md
@@ -2,8 +2,8 @@
**Supported types**
-| field | result |
-| --- | --- |
-| keyword | keyword |
-| text | keyword |
+| field | options | result |
+| --- | --- | --- |
+| keyword | | keyword |
+| text | | keyword |
diff --git a/docs/reference/query-languages/esql/images/functions/categorize.svg b/docs/reference/query-languages/esql/images/functions/categorize.svg
index bbb2bda7c480b..7629b9bb978ba 100644
--- a/docs/reference/query-languages/esql/images/functions/categorize.svg
+++ b/docs/reference/query-languages/esql/images/functions/categorize.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java
index ae0ccecf15ed7..99c255acf7268 100644
--- a/server/src/main/java/org/elasticsearch/TransportVersions.java
+++ b/server/src/main/java/org/elasticsearch/TransportVersions.java
@@ -340,6 +340,7 @@ static TransportVersion def(int id) {
public static final TransportVersion ESQL_FIXED_INDEX_LIKE = def(9_119_0_00);
public static final TransportVersion LOOKUP_JOIN_CCS = def(9_120_0_00);
public static final TransportVersion NODE_USAGE_STATS_FOR_THREAD_POOLS_IN_CLUSTER_INFO = def(9_121_0_00);
+ public static final TransportVersion ESQL_CATEGORIZE_OPTIONS = def(9_122_0_00);
/*
* STOP! READ THIS FIRST! No, really,
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java
index 1cae296f09c02..63f4d9c96bcd0 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java
@@ -128,16 +128,26 @@ public abstract class BlockHash implements Releasable, SeenGroupIds {
public record TopNDef(int order, boolean asc, boolean nullsFirst, int limit) {}
/**
- * @param isCategorize Whether this group is a CATEGORIZE() or not.
- * May be changed in the future when more stateful grouping functions are added.
+ * Configuration for a BlockHash group spec that is doing text categorization.
*/
- public record GroupSpec(int channel, ElementType elementType, boolean isCategorize, @Nullable TopNDef topNDef) {
+ public record CategorizeDef(String analyzer, OutputFormat outputFormat, int similarityThreshold) {
+ public enum OutputFormat {
+ REGEX,
+ TOKENS
+ }
+ }
+
+ public record GroupSpec(int channel, ElementType elementType, @Nullable CategorizeDef categorizeDef, @Nullable TopNDef topNDef) {
public GroupSpec(int channel, ElementType elementType) {
- this(channel, elementType, false, null);
+ this(channel, elementType, null, null);
+ }
+
+ public GroupSpec(int channel, ElementType elementType, CategorizeDef categorizeDef) {
+ this(channel, elementType, categorizeDef, null);
}
- public GroupSpec(int channel, ElementType elementType, boolean isCategorize) {
- this(channel, elementType, isCategorize, null);
+ public boolean isCategorize() {
+ return categorizeDef != null;
}
}
@@ -207,7 +217,13 @@ public static BlockHash buildCategorizeBlockHash(
int emitBatchSize
) {
if (groups.size() == 1) {
- return new CategorizeBlockHash(blockFactory, groups.get(0).channel, aggregatorMode, analysisRegistry);
+ return new CategorizeBlockHash(
+ blockFactory,
+ groups.get(0).channel,
+ aggregatorMode,
+ groups.get(0).categorizeDef,
+ analysisRegistry
+ );
} else {
assert groups.get(0).isCategorize();
assert groups.subList(1, groups.size()).stream().noneMatch(GroupSpec::isCategorize);
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java
index 5e716d8c9d5ff..fcc1a7f3d271e 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java
@@ -18,7 +18,6 @@
import org.elasticsearch.common.util.BytesRefHash;
import org.elasticsearch.compute.aggregation.AggregatorMode;
import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
-import org.elasticsearch.compute.aggregation.SeenGroupIds;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.BytesRefBlock;
@@ -47,12 +46,13 @@
*/
public class CategorizeBlockHash extends BlockHash {
- private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig
+ private static final CategorizationAnalyzerConfig DEFAULT_ANALYZER_CONFIG = CategorizationAnalyzerConfig
.buildStandardEsqlCategorizationAnalyzer();
private static final int NULL_ORD = 0;
private final int channel;
private final AggregatorMode aggregatorMode;
+ private final CategorizeDef categorizeDef;
private final TokenListCategorizer.CloseableTokenListCategorizer categorizer;
private final CategorizeEvaluator evaluator;
@@ -64,28 +64,38 @@ public class CategorizeBlockHash extends BlockHash {
*/
private boolean seenNull = false;
- CategorizeBlockHash(BlockFactory blockFactory, int channel, AggregatorMode aggregatorMode, AnalysisRegistry analysisRegistry) {
+ CategorizeBlockHash(
+ BlockFactory blockFactory,
+ int channel,
+ AggregatorMode aggregatorMode,
+ CategorizeDef categorizeDef,
+ AnalysisRegistry analysisRegistry
+ ) {
super(blockFactory);
this.channel = channel;
this.aggregatorMode = aggregatorMode;
+ this.categorizeDef = categorizeDef;
this.categorizer = new TokenListCategorizer.CloseableTokenListCategorizer(
new CategorizationBytesRefHash(new BytesRefHash(2048, blockFactory.bigArrays())),
CategorizationPartOfSpeechDictionary.getInstance(),
- 0.70f
+ categorizeDef.similarityThreshold() / 100.0f
);
if (aggregatorMode.isInputPartial() == false) {
- CategorizationAnalyzer analyzer;
+ CategorizationAnalyzer categorizationAnalyzer;
try {
Objects.requireNonNull(analysisRegistry);
- analyzer = new CategorizationAnalyzer(analysisRegistry, ANALYZER_CONFIG);
- } catch (Exception e) {
+ CategorizationAnalyzerConfig config = categorizeDef.analyzer() == null
+ ? DEFAULT_ANALYZER_CONFIG
+ : new CategorizationAnalyzerConfig.Builder().setAnalyzer(categorizeDef.analyzer()).build();
+ categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, config);
+ } catch (IOException e) {
categorizer.close();
throw new RuntimeException(e);
}
- this.evaluator = new CategorizeEvaluator(analyzer);
+ this.evaluator = new CategorizeEvaluator(categorizationAnalyzer);
} else {
this.evaluator = null;
}
@@ -114,7 +124,7 @@ public IntVector nonEmpty() {
@Override
public BitArray seenGroupIds(BigArrays bigArrays) {
- return new SeenGroupIds.Range(seenNull ? 0 : 1, Math.toIntExact(categorizer.getCategoryCount() + 1)).seenGroupIds(bigArrays);
+ return new Range(seenNull ? 0 : 1, Math.toIntExact(categorizer.getCategoryCount() + 1)).seenGroupIds(bigArrays);
}
@Override
@@ -222,7 +232,7 @@ private Block buildFinalBlock() {
try (BytesRefBlock.Builder result = blockFactory.newBytesRefBlockBuilder(categorizer.getCategoryCount())) {
result.appendNull();
for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
- scratch.copyChars(category.getRegex());
+ scratch.copyChars(getKeyString(category));
result.appendBytesRef(scratch.get());
scratch.clear();
}
@@ -232,7 +242,7 @@ private Block buildFinalBlock() {
try (BytesRefVector.Builder result = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) {
for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
- scratch.copyChars(category.getRegex());
+ scratch.copyChars(getKeyString(category));
result.appendBytesRef(scratch.get());
scratch.clear();
}
@@ -240,6 +250,13 @@ private Block buildFinalBlock() {
}
}
+ private String getKeyString(SerializableTokenListCategory category) {
+ return switch (categorizeDef.outputFormat()) {
+ case REGEX -> category.getRegex();
+ case TOKENS -> category.getKeyTokensString();
+ };
+ }
+
/**
* Similar implementation to an Evaluator.
*/
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java
index 20874cb10ceb8..bb5f0dee8ca2d 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java
@@ -56,6 +56,8 @@ public class CategorizePackedValuesBlockHash extends BlockHash {
int emitBatchSize
) {
super(blockFactory);
+ assert specs.get(0).categorizeDef() != null;
+
this.specs = specs;
this.aggregatorMode = aggregatorMode;
blocks = new Block[specs.size()];
@@ -68,7 +70,13 @@ public class CategorizePackedValuesBlockHash extends BlockHash {
boolean success = false;
try {
- categorizeBlockHash = new CategorizeBlockHash(blockFactory, specs.get(0).channel(), aggregatorMode, analysisRegistry);
+ categorizeBlockHash = new CategorizeBlockHash(
+ blockFactory,
+ specs.get(0).channel(),
+ aggregatorMode,
+ specs.get(0).categorizeDef(),
+ analysisRegistry
+ );
packedValuesBlockHash = new PackedValuesBlockHash(delegateSpecs, blockFactory, emitBatchSize);
success = true;
} finally {
diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java
index 842952f9ef8bd..9ce086307acee 100644
--- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java
+++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java
@@ -76,7 +76,13 @@ private void initAnalysisRegistry() throws IOException {
).getAnalysisRegistry();
}
+ private BlockHash.CategorizeDef getCategorizeDef() {
+ return new BlockHash.CategorizeDef(null, randomFrom(BlockHash.CategorizeDef.OutputFormat.values()), 70);
+ }
+
public void testCategorizeRaw() {
+ BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+
final Page page;
boolean withNull = randomBoolean();
final int positions = 7 + (withNull ? 1 : 0);
@@ -98,7 +104,7 @@ public void testCategorizeRaw() {
page = new Page(builder.build());
}
- try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, analysisRegistry)) {
+ try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, categorizeDef, analysisRegistry)) {
for (int i = randomInt(2); i < 3; i++) {
hash.add(page, new GroupingAggregatorFunction.AddInput() {
private void addBlock(int positionOffset, IntBlock groupIds) {
@@ -137,7 +143,10 @@ public void close() {
}
});
- assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+ switch (categorizeDef.outputFormat()) {
+ case REGEX -> assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+ case TOKENS -> assertHashState(hash, withNull, "Connected to", "Connection error", "Disconnected");
+ }
}
} finally {
page.releaseBlocks();
@@ -145,6 +154,8 @@ public void close() {
}
public void testCategorizeRawMultivalue() {
+ BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+
final Page page;
boolean withNull = randomBoolean();
final int positions = 3 + (withNull ? 1 : 0);
@@ -170,7 +181,7 @@ public void testCategorizeRawMultivalue() {
page = new Page(builder.build());
}
- try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, analysisRegistry)) {
+ try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, categorizeDef, analysisRegistry)) {
for (int i = randomInt(2); i < 3; i++) {
hash.add(page, new GroupingAggregatorFunction.AddInput() {
private void addBlock(int positionOffset, IntBlock groupIds) {
@@ -216,7 +227,10 @@ public void close() {
}
});
- assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+ switch (categorizeDef.outputFormat()) {
+ case REGEX -> assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?");
+ case TOKENS -> assertHashState(hash, withNull, "Connected to", "Connection error", "Disconnected");
+ }
}
} finally {
page.releaseBlocks();
@@ -224,6 +238,8 @@ public void close() {
}
public void testCategorizeIntermediate() {
+ BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+
Page page1;
boolean withNull = randomBoolean();
int positions1 = 7 + (withNull ? 1 : 0);
@@ -259,8 +275,8 @@ public void testCategorizeIntermediate() {
// Fill intermediatePages with the intermediate state from the raw hashes
try (
- BlockHash rawHash1 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, analysisRegistry);
- BlockHash rawHash2 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, analysisRegistry);
+ BlockHash rawHash1 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, categorizeDef, analysisRegistry);
+ BlockHash rawHash2 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, categorizeDef, analysisRegistry);
) {
rawHash1.add(page1, new GroupingAggregatorFunction.AddInput() {
private void addBlock(int positionOffset, IntBlock groupIds) {
@@ -335,7 +351,7 @@ public void close() {
page2.releaseBlocks();
}
- try (var intermediateHash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.FINAL, null)) {
+ try (var intermediateHash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.FINAL, categorizeDef, null)) {
intermediateHash.add(intermediatePage1, new GroupingAggregatorFunction.AddInput() {
private void addBlock(int positionOffset, IntBlock groupIds) {
List values = IntStream.range(0, groupIds.getPositionCount())
@@ -403,14 +419,24 @@ public void close() {
}
});
- assertHashState(
- intermediateHash,
- withNull,
- ".*?Connected.+?to.*?",
- ".*?Connection.+?error.*?",
- ".*?Disconnected.*?",
- ".*?System.+?shutdown.*?"
- );
+ switch (categorizeDef.outputFormat()) {
+ case REGEX -> assertHashState(
+ intermediateHash,
+ withNull,
+ ".*?Connected.+?to.*?",
+ ".*?Connection.+?error.*?",
+ ".*?Disconnected.*?",
+ ".*?System.+?shutdown.*?"
+ );
+ case TOKENS -> assertHashState(
+ intermediateHash,
+ withNull,
+ "Connected to",
+ "Connection error",
+ "Disconnected",
+ "System shutdown"
+ );
+ }
}
} finally {
intermediatePage1.releaseBlocks();
@@ -419,6 +445,9 @@ public void close() {
}
public void testCategorize_withDriver() {
+ BlockHash.CategorizeDef categorizeDef = getCategorizeDef();
+ BlockHash.GroupSpec groupSpec = new BlockHash.GroupSpec(0, ElementType.BYTES_REF, categorizeDef);
+
BigArrays bigArrays = new MockBigArrays(PageCacheRecycler.NON_RECYCLING_INSTANCE, ByteSizeValue.ofMb(256)).withCircuitBreaking();
CircuitBreaker breaker = bigArrays.breakerService().getBreaker(CircuitBreaker.REQUEST);
DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays));
@@ -477,7 +506,7 @@ public void testCategorize_withDriver() {
new LocalSourceOperator(input1),
List.of(
new HashAggregationOperator.HashAggregationOperatorFactory(
- List.of(makeGroupSpec()),
+ List.of(groupSpec),
AggregatorMode.INITIAL,
List.of(
new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.INITIAL, List.of(1)),
@@ -496,7 +525,7 @@ public void testCategorize_withDriver() {
new LocalSourceOperator(input2),
List.of(
new HashAggregationOperator.HashAggregationOperatorFactory(
- List.of(makeGroupSpec()),
+ List.of(groupSpec),
AggregatorMode.INITIAL,
List.of(
new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.INITIAL, List.of(1)),
@@ -517,7 +546,7 @@ public void testCategorize_withDriver() {
new CannedSourceOperator(intermediateOutput.iterator()),
List.of(
new HashAggregationOperator.HashAggregationOperatorFactory(
- List.of(makeGroupSpec()),
+ List.of(groupSpec),
AggregatorMode.FINAL,
List.of(
new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.FINAL, List.of(1, 2)),
@@ -544,23 +573,36 @@ public void testCategorize_withDriver() {
sums.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputSums.getLong(i));
maxs.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputMaxs.getLong(i));
}
+ List<String> keys = switch (categorizeDef.outputFormat()) {
+ case REGEX -> List.of(
+ ".*?aaazz.*?",
+ ".*?bbbzz.*?",
+ ".*?ccczz.*?",
+ ".*?dddzz.*?",
+ ".*?eeezz.*?",
+ ".*?words.+?words.+?words.+?goodbye.*?",
+ ".*?words.+?words.+?words.+?hello.*?"
+ );
+ case TOKENS -> List.of("aaazz", "bbbzz", "ccczz", "dddzz", "eeezz", "words words words goodbye", "words words words hello");
+ };
+
assertThat(
sums,
equalTo(
Map.of(
- ".*?aaazz.*?",
+ keys.get(0),
1L,
- ".*?bbbzz.*?",
+ keys.get(1),
2L,
- ".*?ccczz.*?",
+ keys.get(2),
33L,
- ".*?dddzz.*?",
+ keys.get(3),
44L,
- ".*?eeezz.*?",
+ keys.get(4),
5L,
- ".*?words.+?words.+?words.+?goodbye.*?",
+ keys.get(5),
8888L,
- ".*?words.+?words.+?words.+?hello.*?",
+ keys.get(6),
999L
)
)
@@ -569,19 +611,19 @@ public void testCategorize_withDriver() {
maxs,
equalTo(
Map.of(
- ".*?aaazz.*?",
+ keys.get(0),
1L,
- ".*?bbbzz.*?",
+ keys.get(1),
2L,
- ".*?ccczz.*?",
+ keys.get(2),
30L,
- ".*?dddzz.*?",
+ keys.get(3),
40L,
- ".*?eeezz.*?",
+ keys.get(4),
5L,
- ".*?words.+?words.+?words.+?goodbye.*?",
+ keys.get(5),
8000L,
- ".*?words.+?words.+?words.+?hello.*?",
+ keys.get(6),
900L
)
)
@@ -589,10 +631,6 @@ public void testCategorize_withDriver() {
Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks));
}
- private BlockHash.GroupSpec makeGroupSpec() {
- return new BlockHash.GroupSpec(0, ElementType.BYTES_REF, true);
- }
-
private void assertHashState(CategorizeBlockHash hash, boolean withNull, String... expectedKeys) {
// Check the keys
Block[] blocks = null;
diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java
index 734b0660d24a3..d0eb89eafd841 100644
--- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java
+++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java
@@ -74,10 +74,15 @@ public void testCategorize_withDriver() {
DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays));
boolean withNull = randomBoolean();
boolean withMultivalues = randomBoolean();
+ BlockHash.CategorizeDef categorizeDef = new BlockHash.CategorizeDef(
+ null,
+ randomFrom(BlockHash.CategorizeDef.OutputFormat.values()),
+ 70
+ );
List<BlockHash.GroupSpec> groupSpecs = List.of(
- new BlockHash.GroupSpec(0, ElementType.BYTES_REF, true),
- new BlockHash.GroupSpec(1, ElementType.INT, false)
+ new BlockHash.GroupSpec(0, ElementType.BYTES_REF, categorizeDef),
+ new BlockHash.GroupSpec(1, ElementType.INT, null)
);
LocalSourceOperator.BlockSupplier input1 = () -> {
@@ -218,8 +223,12 @@ public void testCategorize_withDriver() {
}
Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks));
+ List<String> keys = switch (categorizeDef.outputFormat()) {
+ case REGEX -> List.of(".*?connected.+?to.*?", ".*?connection.+?error.*?", ".*?disconnected.*?");
+ case TOKENS -> List.of("connected to", "connection error", "disconnected");
+ };
Map<String, Map<Integer, Set<String>>> expectedResult = Map.of(
- ".*?connected.+?to.*?",
+ keys.get(0),
Map.of(
7,
Set.of("connected to 1.1.1", "connected to 1.1.2", "connected to 1.1.4", "connected to 2.1.2"),
@@ -228,9 +237,9 @@ public void testCategorize_withDriver() {
111,
Set.of("connected to 2.1.1")
),
- ".*?connection.+?error.*?",
+ keys.get(1),
Map.of(7, Set.of("connection error"), 42, Set.of("connection error")),
- ".*?disconnected.*?",
+ keys.get(2),
Map.of(7, Set.of("disconnected"))
);
if (withNull) {
diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java
index f96b9d26f075c..0ebfa7e72b805 100644
--- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java
+++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java
@@ -363,7 +363,7 @@ private void hashBatchesCallbackOnLast(Consumer callback, Block[]..
private BlockHash buildBlockHash(int emitBatchSize, Block... values) {
List<BlockHash.GroupSpec> specs = new ArrayList<>(values.length);
for (int c = 0; c < values.length; c++) {
- specs.add(new BlockHash.GroupSpec(c, values[c].elementType(), false, topNDef(c)));
+ specs.add(new BlockHash.GroupSpec(c, values[c].elementType(), null, topNDef(c)));
}
assert forcePackedHash == false : "Packed TopN hash not implemented yet";
/*return forcePackedHash
diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java
index 106b9613d7bb2..0e9c0e33d22cd 100644
--- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java
+++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java
@@ -113,7 +113,7 @@ public void testTopNNullsLast() {
try (
var operator = new HashAggregationOperator.HashAggregationOperatorFactory(
- List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, false, new BlockHash.TopNDef(0, ascOrder, false, 3))),
+ List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, null, new BlockHash.TopNDef(0, ascOrder, false, 3))),
mode,
List.of(
new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(mode, aggregatorChannels),
@@ -190,7 +190,7 @@ public void testTopNNullsFirst() {
try (
var operator = new HashAggregationOperator.HashAggregationOperatorFactory(
- List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, false, new BlockHash.TopNDef(0, ascOrder, true, 3))),
+ List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, null, new BlockHash.TopNDef(0, ascOrder, true, 3))),
mode,
List.of(
new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(mode, aggregatorChannels),
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec
index 7168ca3dc398f..be46e68a8b08a 100644
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec
@@ -397,7 +397,7 @@ FROM sample_data
;
COUNT():long | SUM(event_duration):long | category:keyword
- 7 | 23231327 | null
+ 7 | 23231327 | null
;
on null row
@@ -800,3 +800,82 @@ COUNT():long | VALUES(str):keyword | category:keyword | str:keyword
1 | [a, b, c] | null | b
1 | [a, b, c] | null | c
;
+
+with option output_format regex
+required_capability: categorize_options
+
+FROM sample_data
+ | STATS count=COUNT()
+ BY category=CATEGORIZE(message, {"output_format": "regex"})
+ | SORT count DESC, category
+;
+
+count:long | category:keyword
+ 3 | .*?Connected.+?to.*?
+ 3 | .*?Connection.+?error.*?
+ 1 | .*?Disconnected.*?
+;
+
+with option output_format tokens
+required_capability: categorize_options
+
+FROM sample_data
+ | STATS count=COUNT()
+ BY category=CATEGORIZE(message, {"output_format": "tokens"})
+ | SORT count DESC, category
+;
+
+count:long | category:keyword
+ 3 | Connected to
+ 3 | Connection error
+ 1 | Disconnected
+;
+
+with option similarity_threshold
+required_capability: categorize_options
+
+FROM sample_data
+ | STATS count=COUNT()
+ BY category=CATEGORIZE(message, {"similarity_threshold": 99})
+ | SORT count DESC, category
+;
+
+count:long | category:keyword
+3 | .*?Connection.+?error.*?
+1 | .*?Connected.+?to.+?10\.1\.0\.1.*?
+1 | .*?Connected.+?to.+?10\.1\.0\.2.*?
+1 | .*?Connected.+?to.+?10\.1\.0\.3.*?
+1 | .*?Disconnected.*?
+;
+
+with option analyzer
+required_capability: categorize_options
+
+FROM sample_data
+ | STATS count=COUNT()
+ BY category=CATEGORIZE(message, {"analyzer": "stop"})
+ | SORT count DESC, category
+;
+
+count:long | category:keyword
+3 | .*?connected.*?
+3 | .*?connection.+?error.*?
+1 | .*?disconnected.*?
+;
+
+with all options
+required_capability: categorize_options
+
+FROM sample_data
+ | STATS count=COUNT()
+ BY category=CATEGORIZE(message, {"analyzer": "whitespace", "similarity_threshold": 100, "output_format": "tokens"})
+ | SORT count DESC, category
+;
+
+count:long | category:keyword
+3 | Connection error
+1 | Connected to 10.1.0.1
+1 | Connected to 10.1.0.2
+1 | Connected to 10.1.0.3
+1 | Disconnected
+;
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
index f4f6bd8a12d79..41a92214406bb 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
@@ -1254,10 +1254,12 @@ public enum Cap {
* FUSE command
*/
FUSE(Build.current().isSnapshot()),
+
/**
* Support improved behavior for LIKE operator when used with index fields.
*/
LIKE_ON_INDEX_FIELDS,
+
/**
* Support avg with aggregate metric doubles
*/
@@ -1274,10 +1276,15 @@ public enum Cap {
*/
FAIL_IF_ALL_SHARDS_FAIL(Build.current().isSnapshot()),
- /*
+ /**
* Cosine vector similarity function
*/
- COSINE_VECTOR_SIMILARITY_FUNCTION(Build.current().isSnapshot());
+ COSINE_VECTOR_SIMILARITY_FUNCTION(Build.current().isSnapshot()),
+
+ /**
+ * Support for the options field of CATEGORIZE.
+ */
+ CATEGORIZE_OPTIONS;
private final boolean enabled;
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java
new file mode 100644
index 0000000000000..891d8f1e6c264
--- /dev/null
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java
@@ -0,0 +1,107 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.esql.expression.function;
+
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.xpack.esql.core.InvalidArgumentException;
+import org.elasticsearch.xpack.esql.core.expression.EntryExpression;
+import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.Literal;
+import org.elasticsearch.xpack.esql.core.expression.MapExpression;
+import org.elasticsearch.xpack.esql.core.expression.TypeResolutions;
+import org.elasticsearch.xpack.esql.core.tree.Source;
+import org.elasticsearch.xpack.esql.core.type.DataType;
+import org.elasticsearch.xpack.esql.core.type.DataTypeConverter;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Consumer;
+
+import static org.elasticsearch.common.logging.LoggerMessageFormat.format;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull;
+
+public class Options {
+
+ public static Expression.TypeResolution resolve(
+ Expression options,
+ Source source,
+ TypeResolutions.ParamOrdinal paramOrdinal,
+ Map<String, DataType> allowedOptions
+ ) {
+ return resolve(options, source, paramOrdinal, allowedOptions, null);
+ }
+
+ public static Expression.TypeResolution resolve(
+ Expression options,
+ Source source,
+ TypeResolutions.ParamOrdinal paramOrdinal,
+ Map allowedOptions,
+ Consumer
*/
@SupportsObservabilityTier(tier = COMPLETE)
-public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction implements LicenseAware {
+public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction implements OptionalArgument, LicenseAware {
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(
Expression.class,
"Categorize",
Categorize::new
);
+ private static final String ANALYZER = "analyzer";
+ private static final String OUTPUT_FORMAT = "output_format";
+ private static final String SIMILARITY_THRESHOLD = "similarity_threshold";
+
+ private static final Map<String, DataType> ALLOWED_OPTIONS = new TreeMap<>(
+ Map.ofEntries(entry(ANALYZER, KEYWORD), entry(OUTPUT_FORMAT, KEYWORD), entry(SIMILARITY_THRESHOLD, INTEGER))
+ );
+
private final Expression field;
+ private final Expression options;
@FunctionInfo(
returnType = "keyword",
@@ -70,21 +97,56 @@ public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction
)
public Categorize(
Source source,
- @Param(name = "field", type = { "text", "keyword" }, description = "Expression to categorize") Expression field
-
+ @Param(name = "field", type = { "text", "keyword" }, description = "Expression to categorize") Expression field,
+ @MapParam(
+ name = "options",
+ description = "(Optional) Categorize additional options as <<esql-function-named-params,function named parameters>>.",
+ params = {
+ @MapParam.MapParamEntry(
+ name = ANALYZER,
+ type = "keyword",
+ valueHint = { "standard" },
+ description = "Analyzer used to convert the field into tokens for text categorization."
+ ),
+ @MapParam.MapParamEntry(
+ name = OUTPUT_FORMAT,
+ type = "keyword",
+ valueHint = { "regex", "tokens" },
+ description = "The output format of the categories. Defaults to regex."
+ ),
+ @MapParam.MapParamEntry(
+ name = SIMILARITY_THRESHOLD,
+ type = "integer",
+ valueHint = { "70" },
+ description = "The minimum percentage of token weight that must match for text to be added to the category bucket. "
+ + "Must be between 1 and 100. The larger the value the narrower the categories. "
+ + "Larger values will increase memory usage and create narrower categories. Defaults to 70."
+ ), },
+ optional = true
+ ) Expression options
) {
- super(source, List.of(field));
+ super(source, options == null ? List.of(field) : List.of(field, options));
this.field = field;
+ this.options = options;
}
private Categorize(StreamInput in) throws IOException {
- this(Source.readFrom((PlanStreamInput) in), in.readNamedWriteable(Expression.class));
+ this(
+ Source.readFrom((PlanStreamInput) in),
+ in.readNamedWriteable(Expression.class),
+ in.getTransportVersion().onOrAfter(TransportVersions.ESQL_CATEGORIZE_OPTIONS)
+ ? in.readOptionalNamedWriteable(Expression.class)
+ : null
+ );
}
@Override
public void writeTo(StreamOutput out) throws IOException {
source().writeTo(out);
out.writeNamedWriteable(field);
+ if (out.getTransportVersion().onOrAfter(TransportVersions.ESQL_CATEGORIZE_OPTIONS)) {
+ out.writeOptionalNamedWriteable(options);
+ }
}
@Override
@@ -107,7 +169,48 @@ public Nullability nullable() {
@Override
protected TypeResolution resolveType() {
- return isString(field(), sourceText(), DEFAULT);
+ return isString(field(), sourceText(), DEFAULT).and(
+ Options.resolve(options, source(), SECOND, ALLOWED_OPTIONS, this::verifyOptions)
+ );
+ }
+
+ private void verifyOptions(Map optionsMap) {
+ if (options == null) {
+ return;
+ }
+ Integer similarityThreshold = (Integer) optionsMap.get(SIMILARITY_THRESHOLD);
+ if (similarityThreshold != null) {
+ if (similarityThreshold <= 0 || similarityThreshold > 100) {
+ throw new InvalidArgumentException(
+ format("invalid similarity threshold [{}], expecting a number between 1 and 100, inclusive", similarityThreshold)
+ );
+ }
+ }
+ String outputFormat = (String) optionsMap.get(OUTPUT_FORMAT);
+ if (outputFormat != null) {
+ try {
+ OutputFormat.valueOf(outputFormat.toUpperCase(Locale.ROOT));
+ } catch (IllegalArgumentException e) {
+ throw new InvalidArgumentException(
+ format(null, "invalid output format [{}], expecting one of [REGEX, TOKENS]", outputFormat)
+ );
+ }
+ }
+ }
+
+ public CategorizeDef categorizeDef() {
+ Map optionsMap = new HashMap<>();
+ if (options != null) {
+ Options.populateMap((MapExpression) options, optionsMap, source(), SECOND, ALLOWED_OPTIONS);
+ }
+ Integer similarityThreshold = (Integer) optionsMap.get(SIMILARITY_THRESHOLD);
+ String outputFormatString = (String) optionsMap.get(OUTPUT_FORMAT);
+ OutputFormat outputFormat = outputFormatString == null ? null : OutputFormat.valueOf(outputFormatString.toUpperCase(Locale.ROOT));
+ return new CategorizeDef(
+ (String) optionsMap.get("analyzer"),
+ outputFormat == null ? REGEX : outputFormat,
+ similarityThreshold == null ? 70 : similarityThreshold
+ );
}
@Override
@@ -117,12 +220,12 @@ public DataType dataType() {
@Override
public Categorize replaceChildren(List<Expression> newChildren) {
- return new Categorize(source(), newChildren.get(0));
+ return new Categorize(source(), newChildren.get(0), newChildren.size() > 1 ? newChildren.get(1) : null);
}
@Override
protected NodeInfo<? extends Expression> info() {
- return NodeInfo.create(this, Categorize::new, field);
+ return NodeInfo.create(this, Categorize::new, field, options);
}
public Expression field() {
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java
index 61528521c3749..cab5ec862d7f5 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java
@@ -30,6 +30,7 @@
import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
import org.elasticsearch.xpack.esql.expression.function.MapParam;
import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
import org.elasticsearch.xpack.esql.expression.function.Param;
import org.elasticsearch.xpack.esql.expression.function.fulltext.FullTextFunction;
import org.elasticsearch.xpack.esql.expression.function.fulltext.Match;
@@ -53,10 +54,10 @@
import static org.elasticsearch.search.vectors.KnnVectorQueryBuilder.NUM_CANDS_FIELD;
import static org.elasticsearch.search.vectors.KnnVectorQueryBuilder.VECTOR_SIMILARITY_FIELD;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FOURTH;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.THIRD;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable;
-import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNullAndFoldable;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isType;
@@ -198,7 +199,7 @@ public DataType dataType() {
@Override
protected TypeResolution resolveParams() {
- return resolveField().and(resolveQuery()).and(resolveK()).and(resolveOptions());
+ return resolveField().and(resolveQuery()).and(resolveK()).and(Options.resolve(options(), source(), FOURTH, ALLOWED_OPTIONS));
}
private TypeResolution resolveField() {
@@ -221,37 +222,6 @@ private TypeResolution resolveK() {
.and(isNotNull(k(), sourceText(), THIRD));
}
- private TypeResolution resolveOptions() {
- if (options() != null) {
- TypeResolution resolution = isNotNull(options(), sourceText(), TypeResolutions.ParamOrdinal.FOURTH);
- if (resolution.unresolved()) {
- return resolution;
- }
- // MapExpression does not have a DataType associated with it
- resolution = isMapExpression(options(), sourceText(), TypeResolutions.ParamOrdinal.FOURTH);
- if (resolution.unresolved()) {
- return resolution;
- }
-
- try {
- knnQueryOptions();
- } catch (InvalidArgumentException e) {
- return new TypeResolution(e.getMessage());
- }
- }
- return TypeResolution.TYPE_RESOLVED;
- }
-
- private Map<String, Object> knnQueryOptions() throws InvalidArgumentException {
- if (options() == null) {
- return Map.of();
- }
-
- Map<String, Object> matchOptions = new HashMap<>();
- populateOptionsMap((MapExpression) options(), matchOptions, TypeResolutions.ParamOrdinal.FOURTH, sourceText(), ALLOWED_OPTIONS);
- return matchOptions;
- }
-
@Override
public Expression replaceQueryBuilder(QueryBuilder queryBuilder) {
return new Knn(source(), field(), query(), k(), options(), queryBuilder, filterExpressions());
@@ -307,7 +277,7 @@ public Expression withFilters(List<Expression> filterExpressions) {
private Map<String, Object> queryOptions() throws InvalidArgumentException {
Map<String, Object> options = new HashMap<>();
if (options() != null) {
- populateOptionsMap((MapExpression) options(), options, TypeResolutions.ParamOrdinal.FOURTH, sourceText(), ALLOWED_OPTIONS);
+ Options.populateMap((MapExpression) options(), options, source(), FOURTH, ALLOWED_OPTIONS);
}
return options;
}
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
index dd7ee26aa84bd..8fe9ccc18c006 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
@@ -10,6 +10,7 @@
import org.elasticsearch.xpack.esql.core.expression.Alias;
import org.elasticsearch.xpack.esql.core.expression.Attribute;
import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.MapExpression;
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
import org.elasticsearch.xpack.esql.core.util.Holder;
import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction;
@@ -137,13 +138,13 @@ private static Expression transformNonEvaluatableGroupingFunction(
List<Expression> newChildren = new ArrayList<>(gf.children().size());
for (Expression ex : gf.children()) {
- if (ex instanceof Attribute == false) { // TODO: foldables shouldn't require eval'ing either
+ if (ex instanceof Attribute || ex instanceof MapExpression) {
+ newChildren.add(ex);
+ } else { // TODO: foldables shouldn't require eval'ing either
var alias = new Alias(ex.source(), syntheticName(ex, gf, counter++), ex, null, true);
evals.add(alias);
newChildren.add(alias.toAttribute());
childrenChanged = true;
- } else {
- newChildren.add(ex);
}
}
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java
index a5d19fcc3fb14..e45fe2b0e81d8 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java
@@ -343,8 +343,12 @@ BlockHash.GroupSpec toHashGroupSpec() {
if (channel == null) {
throw new EsqlIllegalArgumentException("planned to use ordinals but tried to use the hash instead");
}
-
- return new BlockHash.GroupSpec(channel, elementType(), Alias.unwrap(expression) instanceof Categorize, null);
+ return new BlockHash.GroupSpec(
+ channel,
+ elementType(),
+ Alias.unwrap(expression) instanceof Categorize categorize ? categorize.categorizeDef() : null,
+ null
+ );
}
ElementType elementType() {
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java
index 4951479514b72..5d4260eb4ee66 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java
@@ -1972,6 +1972,57 @@ public void testCategorizeWithFilteredAggregations() {
);
}
+ public void testCategorizeInvalidOptionsField() {
+ assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+ assertEquals(
+ "1:31: second argument of [CATEGORIZE(last_name, first_name)] must be a map expression, received [first_name]",
+ error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, first_name)")
+ );
+ assertEquals(
+ "1:31: Invalid option [blah] in [CATEGORIZE(last_name, { \"blah\": 42 })], "
+ + "expected one of [analyzer, output_format, similarity_threshold]",
+ error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"blah\": 42 })")
+ );
+ }
+
+ public void testCategorizeOptionOutputFormat() {
+ assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+ query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"regex\" })");
+ query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"REGEX\" })");
+ query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"tokens\" })");
+ query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"ToKeNs\" })");
+ assertEquals(
+ "1:31: invalid output format [blah], expecting one of [REGEX, TOKENS]",
+ error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"blah\" })")
+ );
+ assertEquals(
+ "1:31: invalid output format [42], expecting one of [REGEX, TOKENS]",
+ error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": 42 })")
+ );
+ }
+
+ public void testCategorizeOptionSimilarityThreshold() {
+ assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+ query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 1 })");
+ query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 100 })");
+ assertEquals(
+ "1:31: invalid similarity threshold [0], expecting a number between 1 and 100, inclusive",
+ error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 0 })")
+ );
+ assertEquals(
+ "1:31: invalid similarity threshold [101], expecting a number between 1 and 100, inclusive",
+ error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 101 })")
+ );
+ assertEquals(
+ "1:31: Invalid option [similarity_threshold] in [CATEGORIZE(last_name, { \"similarity_threshold\": \"blah\" })], "
+ + "cannot cast [blah] to [integer]",
+ error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": \"blah\" })")
+ );
+ }
+
public void testChangePoint() {
assumeTrue("change_point must be enabled", EsqlCapabilities.Cap.CHANGE_POINT.isEnabled());
var airports = AnalyzerTestUtils.analyzer(loadMapping("mapping-airports.json", "airports"));
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java
index f674f9b2c3d72..97d5b8e3ece96 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java
@@ -27,7 +27,7 @@ protected List<TestCaseSupplier> cases() {
@Override
protected Expression build(Source source, List<Expression> args) {
- return new Categorize(source, args.get(0));
+ return new Categorize(source, args.get(0), args.size() > 1 ? args.get(1) : null);
}
@Override
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java
index f69bb7eb3e7bb..296d624ee1777 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java
@@ -61,7 +61,7 @@ public static Iterable