From 117337e6d99df7e6e57341b1987297c7e83f7b88 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 17 Jul 2025 11:44:47 +1000 Subject: [PATCH 1/7] Update regions_by_endpoint for AWS sdk upgrade. Also add test to ensure the file has at least one entry for each region so that it is easy to spot missing regions in future upgrades. Relates: #131050 --- .../repositories/s3/regions_by_endpoint.txt | 16 ++++++++++++++++ .../s3/RegionFromEndpointGuesserTests.java | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt b/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt index 3fae5c314c10b..5ca027a5f4a13 100644 --- a/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt +++ b/modules/repository-s3/src/main/resources/org/elasticsearch/repositories/s3/regions_by_endpoint.txt @@ -6,6 +6,10 @@ ap-east-1 s3-fips.ap-east-1.amazonaws.com ap-east-1 s3-fips.dualstack.ap-east-1.amazonaws.com ap-east-1 s3.ap-east-1.amazonaws.com ap-east-1 s3.dualstack.ap-east-1.amazonaws.com +ap-east-2 s3-fips.ap-east-2.amazonaws.com +ap-east-2 s3-fips.dualstack.ap-east-2.amazonaws.com +ap-east-2 s3.ap-east-2.amazonaws.com +ap-east-2 s3.dualstack.ap-east-2.amazonaws.com ap-northeast-1 s3-fips.ap-northeast-1.amazonaws.com ap-northeast-1 s3-fips.dualstack.ap-northeast-1.amazonaws.com ap-northeast-1 s3.ap-northeast-1.amazonaws.com @@ -56,6 +60,14 @@ aws-iso-b-global s3-fips.aws-iso-b-global.sc2s.sgov.gov aws-iso-b-global s3-fips.dualstack.aws-iso-b-global.sc2s.sgov.gov aws-iso-b-global s3.aws-iso-b-global.sc2s.sgov.gov aws-iso-b-global s3.dualstack.aws-iso-b-global.sc2s.sgov.gov +aws-iso-e-global s3-fips.aws-iso-e-global.cloud.adc-e.uk +aws-iso-e-global s3-fips.dualstack.aws-iso-e-global.cloud.adc-e.uk +aws-iso-e-global s3.aws-iso-e-global.cloud.adc-e.uk +aws-iso-e-global s3.dualstack.aws-iso-e-global.cloud.adc-e.uk +aws-iso-f-global s3-fips.aws-iso-f-global.csp.hci.ic.gov +aws-iso-f-global s3-fips.dualstack.aws-iso-f-global.csp.hci.ic.gov +aws-iso-f-global s3.aws-iso-f-global.csp.hci.ic.gov +aws-iso-f-global s3.dualstack.aws-iso-f-global.csp.hci.ic.gov aws-iso-global s3-fips.aws-iso-global.c2s.ic.gov aws-iso-global s3-fips.dualstack.aws-iso-global.c2s.ic.gov aws-iso-global s3.aws-iso-global.c2s.ic.gov @@ -76,6 +88,10 @@ cn-north-1 s3.cn-north-1.amazonaws.com.cn cn-north-1 s3.dualstack.cn-north-1.amazonaws.com.cn cn-northwest-1 s3.cn-northwest-1.amazonaws.com.cn cn-northwest-1 s3.dualstack.cn-northwest-1.amazonaws.com.cn +eusc-de-east-1 s3-fips.eusc-de-east-1.amazonaws.eu +eusc-de-east-1 s3-fips.dualstack.eusc-de-east-1.amazonaws.eu +eusc-de-east-1 s3.eusc-de-east-1.amazonaws.eu +eusc-de-east-1 s3.dualstack.eusc-de-east-1.amazonaws.eu eu-central-1 s3-fips.dualstack.eu-central-1.amazonaws.com eu-central-1 s3-fips.eu-central-1.amazonaws.com eu-central-1 s3.dualstack.eu-central-1.amazonaws.com diff --git a/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java b/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java index 9fe0c40c83979..402181878b600 100644 --- a/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java +++ b/modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/RegionFromEndpointGuesserTests.java @@ -9,6 +9,11 @@ package 
org.elasticsearch.repositories.s3; +import software.amazon.awssdk.endpoints.Endpoint; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.endpoints.S3EndpointParams; +import software.amazon.awssdk.services.s3.endpoints.internal.DefaultS3EndpointProvider; + import org.elasticsearch.core.Nullable; import org.elasticsearch.test.ESTestCase; @@ -23,6 +28,14 @@ public void testRegionGuessing() { assertRegionGuess("random.endpoint.internal.net", null); } + public void testHasEntryForEachRegion() { + final var defaultS3EndpointProvider = new DefaultS3EndpointProvider(); + for (var region : Region.regions()) { + final Endpoint endpoint = safeGet(defaultS3EndpointProvider.resolveEndpoint(S3EndpointParams.builder().region(region).build())); + assertNotNull(region.id(), RegionFromEndpointGuesser.guessRegion(endpoint.url().toString())); + } + } + private static void assertRegionGuess(String endpoint, @Nullable String expectedRegion) { assertEquals(endpoint, expectedRegion, RegionFromEndpointGuesser.guessRegion(endpoint)); } From 1fad2c3e1bf08b8eabbc7db29e4b1650c1fe947f Mon Sep 17 00:00:00 2001 From: elasticsearchmachine <58790826+elasticsearchmachine@users.noreply.github.com> Date: Thu, 17 Jul 2025 09:43:49 +0200 Subject: [PATCH 2/7] Mute org.elasticsearch.packaging.test.DockerTests test072RunEsAsDifferentUserAndGroup #131412 --- muted-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/muted-tests.yml b/muted-tests.yml index 27ffc88873c70..6f920d95d4189 100644 --- a/muted-tests.yml +++ b/muted-tests.yml @@ -466,6 +466,9 @@ tests: - class: org.elasticsearch.xpack.downsample.DataStreamLifecycleDownsampleDisruptionIT method: testDataStreamLifecycleDownsampleRollingRestart issue: https://github.com/elastic/elasticsearch/issues/131394 +- class: org.elasticsearch.packaging.test.DockerTests + method: test072RunEsAsDifferentUserAndGroup + issue: https://github.com/elastic/elasticsearch/issues/131412 # Examples: # From aadf40c156387793fd6bf3b0e8bd234d5d836d22 Mon Sep 17 00:00:00 2001 From: kanoshiou Date: Thu, 17 Jul 2025 16:23:04 +0800 Subject: [PATCH 3/7] ESQL: Fix inconsistent column order in MV_EXPAND (#129745) The new attribute generated by MV_EXPAND should remain in the original position. The projection added by ProjectAwayColumns does not respect the original order of attributes. Make ProjectAwayColumns respect the order of attributes to fix this. 
--- docs/changelog/129745.yaml | 6 + .../esql/core/expression/AttributeSet.java | 4 + .../rest/generative/GenerativeRestTest.java | 1 - .../src/main/resources/mv_expand.csv-spec | 62 +++ .../xpack/esql/action/EsqlCapabilities.java | 6 + .../rules/physical/ProjectAwayColumns.java | 14 +- .../optimizer/PhysicalPlanOptimizerTests.java | 500 +++++++++--------- 7 files changed, 350 insertions(+), 243 deletions(-) create mode 100644 docs/changelog/129745.yaml diff --git a/docs/changelog/129745.yaml b/docs/changelog/129745.yaml new file mode 100644 index 0000000000000..35cfd0671bd64 --- /dev/null +++ b/docs/changelog/129745.yaml @@ -0,0 +1,6 @@ +pr: 129745 +summary: "ESQL: Fix `mv_expand` inconsistent column order" +area: ES|QL +type: bug +issues: + - 129000 diff --git a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java index d281db4e6bf63..fefaf3098319e 100644 --- a/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java +++ b/x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/expression/AttributeSet.java @@ -261,5 +261,9 @@ public boolean isEmpty() { public AttributeSet build() { return new AttributeSet(mapBuilder.build()); } + + public void clear() { + mapBuilder.keySet().clear(); + } } } diff --git a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java index 2fefcccb4d14f..41b8658302bd7 100644 --- a/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java +++ b/x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java @@ -53,7 +53,6 @@ public abstract class GenerativeRestTest extends ESRestTestCase { "Data too large", // Circuit breaker exceptions eg. 
https://github.com/elastic/elasticsearch/issues/130072 // Awaiting fixes for correctness - "Expecting the following columns \\[.*\\], got", // https://github.com/elastic/elasticsearch/issues/129000 "Expecting at most \\[.*\\] columns, got \\[.*\\]" // https://github.com/elastic/elasticsearch/issues/129561 ); diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec index 20ce3ecc5a396..c7dbe01ef6f09 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/mv_expand.csv-spec @@ -419,3 +419,65 @@ emp_no:integer | job_positions:keyword 10001 | Accountant 10001 | Senior Python Developer ; + +testMvExpandInconsistentColumnOrder1 +required_capability: fix_mv_expand_inconsistent_column_order +from message_types +| eval foo_1 = 1, foo_2 = 2 +| sort message +| mv_expand foo_1 +; + +message:keyword | type:keyword | foo_1:integer | foo_2:integer +Connected to 10.1.0.1 | Success | 1 | 2 +Connected to 10.1.0.2 | Success | 1 | 2 +Connected to 10.1.0.3 | Success | 1 | 2 +Connection error | Error | 1 | 2 +Development environment | Development | 1 | 2 +Disconnected | Disconnected | 1 | 2 +Production environment | Production | 1 | 2 +; + +testMvExpandInconsistentColumnOrder2 +required_capability: fix_mv_expand_inconsistent_column_order +from message_types +| eval foo_1 = [1, 3], foo_2 = 2 +| sort message +| mv_expand foo_1 +; + +message:keyword | type:keyword | foo_1:integer | foo_2:integer +Connected to 10.1.0.1 | Success | 1 | 2 +Connected to 10.1.0.1 | Success | 3 | 2 +Connected to 10.1.0.2 | Success | 1 | 2 +Connected to 10.1.0.2 | Success | 3 | 2 +Connected to 10.1.0.3 | Success | 1 | 2 +Connected to 10.1.0.3 | Success | 3 | 2 +Connection error | Error | 1 | 2 +Connection error | Error | 3 | 2 +Development environment | Development | 1 | 2 +Development environment | Development | 3 | 2 +Disconnected | Disconnected | 1 | 2 +Disconnected | Disconnected | 3 | 2 +Production environment | Production | 1 | 2 +Production environment | Production | 3 | 2 +; + +testMvExpandInconsistentColumnOrder3 +required_capability: fix_mv_expand_inconsistent_column_order +from message_types +| sort type +| eval language_code = 1, `language_name` = false, message = true, foo_3 = 1, foo_2 = null +| eval foo_3 = "1", `foo_3` = -1, foo_1 = 1, `language_code` = null, `foo_2` = "1" +| mv_expand foo_1 +| limit 5 +; + +type:keyword | language_name:boolean | message:boolean | foo_3:integer | foo_1:integer | language_code:null | foo_2:keyword +Development | false | true | -1 | 1 | null | 1 +Disconnected | false | true | -1 | 1 | null | 1 +Error | false | true | -1 | 1 | null | 1 +Production | false | true | -1 | 1 | null | 1 +Success | false | true | -1 | 1 | null | 1 +; + diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index ac75811602bd6..f4f6bd8a12d79 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -1235,6 +1235,12 @@ public enum Cap { */ NO_PLAIN_STRINGS_IN_LITERALS, + /** + * Support for the mv_expand target attribute should be retained in its original position. 
+ * see ES|QL: inconsistent column order #129000 + */ + FIX_MV_EXPAND_INCONSISTENT_COLUMN_ORDER, + /** * (Re)Added EXPLAIN command */ diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java index 26cfbf40eb7ff..887fb039a14cb 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/ProjectAwayColumns.java @@ -76,7 +76,19 @@ public PhysicalPlan apply(PhysicalPlan plan) { // no need for projection when dealing with aggs if (logicalFragment instanceof Aggregate == false) { - List output = new ArrayList<>(requiredAttrBuilder.build()); + // we should respect the order of the attributes + List output = new ArrayList<>(); + for (Attribute attribute : logicalFragment.output()) { + if (requiredAttrBuilder.contains(attribute)) { + output.add(attribute); + requiredAttrBuilder.remove(attribute); + } + } + // requiredAttrBuilder should be empty unless the plan is inconsistent due to a bug. + // This can happen in case of remote ENRICH, see https://github.com/elastic/elasticsearch/issues/118531 + // TODO: stop adding the remaining required attributes once remote ENRICH is fixed. + output.addAll(requiredAttrBuilder.build()); + // if all the fields are filtered out, it's only the count that matters // however until a proper fix (see https://github.com/elastic/elasticsearch/issues/98703) // add a synthetic field (so it doesn't clash with the user defined one) to return a constant diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java index 79d58341783aa..a609a1e494e54 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java @@ -128,6 +128,7 @@ import org.elasticsearch.xpack.esql.plan.physical.LimitExec; import org.elasticsearch.xpack.esql.plan.physical.LocalSourceExec; import org.elasticsearch.xpack.esql.plan.physical.LookupJoinExec; +import org.elasticsearch.xpack.esql.plan.physical.MvExpandExec; import org.elasticsearch.xpack.esql.plan.physical.PhysicalPlan; import org.elasticsearch.xpack.esql.plan.physical.ProjectExec; import org.elasticsearch.xpack.esql.plan.physical.TopNExec; @@ -193,6 +194,7 @@ import static org.hamcrest.Matchers.closeTo; import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.containsInRelativeOrder; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.empty; import static org.hamcrest.Matchers.equalTo; @@ -625,16 +627,16 @@ public void testTripleExtractorPerField() { } /** - * Expected - * LimitExec[10000[INTEGER]] - * \_AggregateExec[[],[AVG(salary{f}#14) AS x],FINAL] - * \_AggregateExec[[],[AVG(salary{f}#14) AS x],PARTIAL] - * \_FilterExec[ROUND(emp_no{f}#9) > 10[INTEGER]] - * \_TopNExec[[Order[last_name{f}#13,ASC,LAST]],10[INTEGER]] - * \_ExchangeExec[] - * \_ProjectExec[[salary{f}#14, first_name{f}#10, emp_no{f}#9, last_name{f}#13]] -- project away _doc - * \_FieldExtractExec[salary{f}#14, 
first_name{f}#10, emp_no{f}#9, last_n..] -- local field extraction - * \_EsQueryExec[test], query[][_doc{f}#16], limit[10], sort[[last_name]] + *LimitExec[10000[INTEGER],8] + * \_AggregateExec[[],[SUM(salary{f}#13460,true[BOOLEAN]) AS x#13454],FINAL,[$$x$sum{r}#13466, $$x$seen{r}#13467],8] + * \_AggregateExec[[],[SUM(salary{f}#13460,true[BOOLEAN]) AS x#13454],INITIAL,[$$x$sum{r}#13466, $$x$seen{r}#13467],8] + * \_FilterExec[ROUND(emp_no{f}#13455) > 10[INTEGER]] + * \_TopNExec[[Order[last_name{f}#13459,ASC,LAST]],10[INTEGER],58] + * \_ExchangeExec[[emp_no{f}#13455, last_name{f}#13459, salary{f}#13460],false] + * \_ProjectExec[[emp_no{f}#13455, last_name{f}#13459, salary{f}#13460]] -- project away _doc + * \_FieldExtractExec[emp_no{f}#13455, last_name{f}#13459, salary{f}#1346..] <[],[]> -- local field extraction + * \_EsQueryExec[test], indexMode[standard], query[][_doc{f}#13482], limit[10], + * sort[[FieldSort[field=last_name{f}#13459, direction=ASC, nulls=LAST]]] estimatedRowSize[74] */ public void testExtractorForField() { var plan = physicalPlan(""" @@ -658,7 +660,7 @@ public void testExtractorForField() { var exchange = asRemoteExchange(topN.child()); var project = as(exchange.child(), ProjectExec.class); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("salary", "emp_no", "last_name")); + assertThat(names(extract.attributesToExtract()), contains("emp_no", "last_name", "salary")); var source = source(extract.child()); assertThat(source.limit(), is(topN.limit())); assertThat(source.sorts(), is(fieldSorts(topN.order()))); @@ -2219,7 +2221,7 @@ public void testNoPushDownChangeCase() { * ages{f}#6, last_name{f}#7, long_noidx{f}#13, salary{f}#8],false] * \_ProjectExec[[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gender{f}#5, hire_date{f}#10, job{f}#11, job.raw{f}#12, langu * ages{f}#6, last_name{f}#7, long_noidx{f}#13, salary{f}#8]] - * \_FieldExtractExec[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gen..]<[],[]> + * \_FieldExtractExec[_meta_field{f}#9, emp_no{f}#3, first_name{f}#4, gen..]<[],[]> * \_EsQueryExec[test], indexMode[standard], query[{"esql_single_value":{"field":"first_name","next":{"regexp":{"first_name": * {"value":"foo*","flags_value":65791,"case_insensitive":true,"max_determinized_states":10000,"boost":0.0}}}, * "source":"TO_LOWER(first_name) RLIKE \"foo*\"@2:9"}}][_doc{f}#25], limit[1000], sort[] estimatedRowSize[332] @@ -2340,10 +2342,10 @@ public void testPushDownUpperCaseChangeLike() { * uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9],false] * \_ProjectExec[[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, gender{f}#6, hire_date{f}#11, job{f}#12, job.raw{f}#13, lang * uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9]] - * \_FieldExtractExec[_meta_field{f}#10, gender{f}#6, hire_date{f}#11, jo..]<[],[]> + * \_FieldExtractExec[_meta_field{f}#10, gender{f}#6, hire_date{f}#11, jo..]<[],[]> * \_LimitExec[1000[INTEGER]] * \_FilterExec[LIKE(first_name{f}#5, "FOO*", true) OR IN(1[INTEGER],2[INTEGER],3[INTEGER],emp_no{f}#4 + 1[INTEGER])] - * \_FieldExtractExec[first_name{f}#5, emp_no{f}#4]<[],[]> + * \_FieldExtractExec[first_name{f}#5, emp_no{f}#4]<[],[]> * \_EsQueryExec[test], indexMode[standard], query[][_doc{f}#26], limit[], sort[] estimatedRowSize[332] */ public void testChangeCaseAsInsensitiveWildcardLikeNotPushedDown() { @@ -2458,22 +2460,17 @@ public void testPushDownEvalFilter() { /** * - * ProjectExec[[last_name{f}#21 AS name, first_name{f}#18 AS last_name, last_name{f}#21 AS 
first_name]] - * \_TopNExec[[Order[last_name{f}#21,ASC,LAST]],10[INTEGER],0] - * \_ExchangeExec[[last_name{f}#21, first_name{f}#18],false] - * \_ProjectExec[[last_name{f}#21, first_name{f}#18]] - * \_FieldExtractExec[last_name{f}#21, first_name{f}#18][] - * \_EsQueryExec[test], indexMode[standard], query[{ - * "bool":{"must":[ - * {"esql_single_value":{ - * "field":"last_name", - * "next":{"range":{"last_name":{"gt":"B","boost":1.0}}}, - * "source":"first_name > \"B\"@3:9" - * }}, - * {"exists":{"field":"first_name","boost":1.0}} - * ],"boost":1.0}}][_doc{f}#40], limit[10], sort[[ - * FieldSort[field=last_name{f}#21, direction=ASC, nulls=LAST] - * ]] estimatedRowSize[116] + * ProjectExec[[last_name{f}#13858 AS name#13841, first_name{f}#13855 AS last_name#13844, last_name{f}#13858 AS first_name#13 + * 847]] + * \_TopNExec[[Order[last_name{f}#13858,ASC,LAST]],10[INTEGER],100] + * \_ExchangeExec[[first_name{f}#13855, last_name{f}#13858],false] + * \_ProjectExec[[first_name{f}#13855, last_name{f}#13858]] + * \_FieldExtractExec[first_name{f}#13855, last_name{f}#13858]<[],[]> + * \_EsQueryExec[test], indexMode[standard], query[ + * {"bool":{"must":[{"esql_single_value":{"field":"last_name","next": + * {"range":{"last_name":{"gt":"B","boost":0.0}}},"source":"first_name > \"B\"@3:9"}}, + * {"exists":{"field":"first_name","boost":0.0}}],"boost":1.0}} + * ][_doc{f}#13879], limit[10], sort[[FieldSort[field=last_name{f}#13858, direction=ASC, nulls=LAST]]] estimatedRowSize[116] * */ public void testPushDownEvalSwapFilter() { @@ -2494,7 +2491,7 @@ public void testPushDownEvalSwapFilter() { var extract = as(project.child(), FieldExtractExec.class); assertThat( extract.attributesToExtract().stream().map(Attribute::name).collect(Collectors.toList()), - contains("last_name", "first_name") + contains("first_name", "last_name") ); // Now verify the correct Lucene push-down of both the filter and the sort @@ -2607,7 +2604,7 @@ public void testDissect() { * uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9, _index{m}#2],false] * \_ProjectExec[[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, gender{f}#6, hire_date{f}#11, job{f}#12, job.raw{f}#13, lang * uages{f}#7, last_name{f}#8, long_noidx{f}#14, salary{f}#9, _index{m}#2]] - * \_FieldExtractExec[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, ge..]<[],[]> + * \_FieldExtractExec[_meta_field{f}#10, emp_no{f}#4, first_name{f}#5, ge..]<[],[]> * \_EsQueryExec[test], indexMode[standard], query[{"wildcard":{"_index":{"wildcard":"test*","boost":0.0}}}][_doc{f}#27], * limit[1000], sort[] estimatedRowSize[382] * @@ -3176,6 +3173,56 @@ public void testProjectAwayAllColumnsWhenOnlyTheCountMattersInStats() { assertThat(Expressions.names(esQuery.attrs()), contains("_doc")); } + /** + * LimitExec[1000[INTEGER],336] + * \_MvExpandExec[foo_1{r}#4236,foo_1{r}#4253] + * \_TopNExec[[Order[emp_no{f}#4242,ASC,LAST]],1000[INTEGER],336] + * \_ExchangeExec[[_meta_field{f}#4248, emp_no{f}#4242, first_name{f}#4243, gender{f}#4244, hire_date{f}#4249, job{f}#4250, job. + * raw{f}#4251, languages{f}#4245, last_name{f}#4246, long_noidx{f}#4252, salary{f}#4247, foo_1{r}#4236, foo_2{r}#4238], + * false] + * \_ProjectExec[[_meta_field{f}#4248, emp_no{f}#4242, first_name{f}#4243, gender{f}#4244, hire_date{f}#4249, job{f}#4250, job. 
+ * raw{f}#4251, languages{f}#4245, last_name{f}#4246, long_noidx{f}#4252, salary{f}#4247, foo_1{r}#4236, foo_2{r}#4238]] + * \_FieldExtractExec[_meta_field{f}#4248, emp_no{f}#4242, first_name{f}#..]<[],[]> + * \_EvalExec[[1[INTEGER] AS foo_1#4236, 1[INTEGER] AS foo_2#4238]] + * \_EsQueryExec[test], indexMode[standard], query[][_doc{f}#4268], limit[1000], sort[[FieldSort[field=emp_no{f}#4242, + * direction=ASC, nulls=LAST]]] estimatedRowSize[352] + */ + public void testProjectAwayMvExpandColumnOrder() { + var plan = optimizedPlan(physicalPlan(""" + from test + | eval foo_1 = 1, foo_2 = 1 + | sort emp_no + | mv_expand foo_1 + """)); + var limit = as(plan, LimitExec.class); + var mvExpand = as(limit.child(), MvExpandExec.class); + var topN = as(mvExpand.child(), TopNExec.class); + var exchange = as(topN.child(), ExchangeExec.class); + var project = as(exchange.child(), ProjectExec.class); + + assertThat( + Expressions.names(project.projections()), + containsInRelativeOrder( + "_meta_field", + "emp_no", + "first_name", + "gender", + "hire_date", + "job", + "job.raw", + "languages", + "last_name", + "long_noidx", + "salary", + "foo_1", + "foo_2" + ) + ); + var fieldExtract = as(project.child(), FieldExtractExec.class); + var eval = as(fieldExtract.child(), EvalExec.class); + EsQueryExec esQuery = as(eval.child(), EsQueryExec.class); + } + /** * ProjectExec[[a{r}#5]] * \_EvalExec[[__a_SUM@81823521{r}#15 / __a_COUNT@31645621{r}#16 AS a]] @@ -5674,16 +5721,15 @@ public void testPushTopNWithFilterToSource() { } /** - * ProjectExec[[abbrev{f}#12321, name{f}#12322, location{f}#12325, country{f}#12326, city{f}#12327]] - * \_TopNExec[[Order[abbrev{f}#12321,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#12321, name{f}#12322, location{f}#12325, country{f}#12326, city{f}#12327],false] - * \_ProjectExec[[abbrev{f}#12321, name{f}#12322, location{f}#12325, country{f}#12326, city{f}#12327]] - * \_FieldExtractExec[abbrev{f}#12321, name{f}#12322, location{f}#12325, ..][] + * ProjectExec[[abbrev{f}#4474, name{f}#4475, location{f}#4478, country{f}#4479, city{f}#4480]] + * \_TopNExec[[Order[abbrev{f}#4474,ASC,LAST]],5[INTEGER],221] + * \_ExchangeExec[[abbrev{f}#4474, city{f}#4480, country{f}#4479, location{f}#4478, name{f}#4475],false] + * \_ProjectExec[[abbrev{f}#4474, city{f}#4480, country{f}#4479, location{f}#4478, name{f}#4475]] + * \_FieldExtractExec[abbrev{f}#4474, city{f}#4480, country{f}#4479, loca..]<[],[]> * \_EsQueryExec[airports], - * indexMode[standard], - * query[][_doc{f}#12337], - * limit[5], - * sort[[FieldSort[field=abbrev{f}#12321, direction=ASC, nulls=LAST]]] estimatedRowSize[237] + * indexMode[standard], + * query[][_doc{f}#4490], + * limit[5], sort[[FieldSort[field=abbrev{f}#4474, direction=ASC, nulls=LAST]]] estimatedRowSize[237] */ public void testPushTopNKeywordToSource() { var optimized = optimizedPlan(physicalPlan(""" @@ -5698,9 +5744,9 @@ public void testPushTopNKeywordToSource() { var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "location", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "location", "name")); var source = 
source(extract.child()); assertThat(source.limit(), is(topN.limit())); assertThat(source.sorts(), is(fieldSorts(topN.order()))); @@ -5716,13 +5762,13 @@ public void testPushTopNKeywordToSource() { /** * - * ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18, abbrev{f}#12 AS code]] - * \_TopNExec[[Order[abbrev{f}#12,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18],false] - * \_ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18]] - * \_FieldExtractExec[abbrev{f}#12, name{f}#13, location{f}#16, country{f..][] - * \_EsQueryExec[airports], indexMode[standard], query[][_doc{f}#29], limit[5], - * sort[[FieldSort[field=abbrev{f}#12, direction=ASC, nulls=LAST]]] estimatedRowSize[237] + * ProjectExec[[abbrev{f}#7828, name{f}#7829, location{f}#7832, country{f}#7833, city{f}#7834, abbrev{f}#7828 AS code#7820]] + * \_TopNExec[[Order[abbrev{f}#7828,ASC,LAST]],5[INTEGER],221] + * \_ExchangeExec[[abbrev{f}#7828, city{f}#7834, country{f}#7833, location{f}#7832, name{f}#7829],false] + * \_ProjectExec[[abbrev{f}#7828, city{f}#7834, country{f}#7833, location{f}#7832, name{f}#7829]] + * \_FieldExtractExec[abbrev{f}#7828, city{f}#7834, country{f}#7833, loca..]<[],[]> + * \_EsQueryExec[airports], indexMode[standard], query[][_doc{f}#7845], limit[5], + * sort[[FieldSort[field=abbrev{f}#7828, direction=ASC, nulls=LAST]]] estimatedRowSize[237] * */ public void testPushTopNAliasedKeywordToSource() { @@ -5740,9 +5786,9 @@ public void testPushTopNAliasedKeywordToSource() { var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "location", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "location", "name")); var source = source(extract.child()); assertThat(source.limit(), is(topN.limit())); assertThat(source.sorts(), is(fieldSorts(topN.order()))); @@ -5757,19 +5803,19 @@ public void testPushTopNAliasedKeywordToSource() { } /** - * ProjectExec[[abbrev{f}#11, name{f}#12, location{f}#15, country{f}#16, city{f}#17]] - * \_TopNExec[[Order[distance{r}#4,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#11, name{f}#12, location{f}#15, country{f}#16, city{f}#17, distance{r}#4],false] - * \_ProjectExec[[abbrev{f}#11, name{f}#12, location{f}#15, country{f}#16, city{f}#17, distance{r}#4]] - * \_FieldExtractExec[abbrev{f}#11, name{f}#12, country{f}#16, city{f}#17][] - * \_EvalExec[[STDISTANCE(location{f}#15,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) - * AS distance]] - * \_FieldExtractExec[location{f}#15][] + * ProjectExec[[abbrev{f}#7283, name{f}#7284, location{f}#7287, country{f}#7288, city{f}#7289]] + * \_TopNExec[[Order[distance{r}#7276,ASC,LAST]],5[INTEGER],229] + * \_ExchangeExec[[abbrev{f}#7283, city{f}#7289, country{f}#7288, location{f}#7287, name{f}#7284, distance{r}#7276],false] + * \_ProjectExec[[abbrev{f}#7283, city{f}#7289, country{f}#7288, location{f}#7287, name{f}#7284, distance{r}#7276]] + * \_FieldExtractExec[abbrev{f}#7283, city{f}#7289, country{f}#7288, name..]<[],[]> + * \_EvalExec[[STDISTANCE(location{f}#7287,[1 1 
0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan + * ce#7276]] + * \_FieldExtractExec[location{f}#7287]<[],[]> * \_EsQueryExec[airports], - * indexMode[standard], - * query[][_doc{f}#28], - * limit[5], - * sort[[GeoDistanceSort[field=location{f}#15, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245] + * indexMode[standard], + * query[][_doc{f}#7300], + * limit[5], + * sort[[GeoDistanceSort[field=location{f}#7287, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245] */ public void testPushTopNDistanceToSource() { var optimized = optimizedPlan(physicalPlan(""" @@ -5785,9 +5831,9 @@ public void testPushTopNDistanceToSource() { var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "distance")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var evalExec = as(extract.child(), EvalExec.class); var alias = as(evalExec.fields().get(0), Alias.class); assertThat(alias.name(), is("distance")); @@ -5814,20 +5860,19 @@ public void testPushTopNDistanceToSource() { } /** - * ProjectExec[[abbrev{f}#8, name{f}#9, location{f}#12, country{f}#13, city{f}#14]] - * \_TopNExec[[Order[$$order_by$0$0{r}#16,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#8, name{f}#9, location{f}#12, country{f}#13, city{f}#14, $$order_by$0$0{r}#16],false] - * \_ProjectExec[[abbrev{f}#8, name{f}#9, location{f}#12, country{f}#13, city{f}#14, $$order_by$0$0{r}#16]] - * \_FieldExtractExec[abbrev{f}#8, name{f}#9, country{f}#13, city{f}#14][] - * \_EvalExec[[ - * STDISTANCE(location{f}#12,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS $$order_by$0$0 - * ]] - * \_FieldExtractExec[location{f}#12][] + *ProjectExec[[abbrev{f}#5258, name{f}#5259, location{f}#5262, country{f}#5263, city{f}#5264]] + * \_TopNExec[[Order[$$order_by$0$0{r}#5266,ASC,LAST]],5[INTEGER],229] + * \_ExchangeExec[[abbrev{f}#5258, city{f}#5264, country{f}#5263, location{f}#5262, name{f}#5259, $$order_by$0$0{r}#5266],false] + * \_ProjectExec[[abbrev{f}#5258, city{f}#5264, country{f}#5263, location{f}#5262, name{f}#5259, $$order_by$0$0{r}#5266]] + * \_FieldExtractExec[abbrev{f}#5258, city{f}#5264, country{f}#5263, name..]<[],[]> + * \_EvalExec[[STDISTANCE(location{f}#5262,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS $$orde + * r_by$0$0#5266]] + * \_FieldExtractExec[location{f}#5262]<[],[]> * \_EsQueryExec[airports], - * indexMode[standard], - * query[][_doc{f}#26], - * limit[5], - * sort[[GeoDistanceSort[field=location{f}#12, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245] + * indexMode[standard], + * query[][_doc{f}#5276], + * limit[5], + * sort[[GeoDistanceSort[field=location{f}#5262, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245] */ public void testPushTopNInlineDistanceToSource() { var optimized = optimizedPlan(physicalPlan(""" @@ -5847,15 +5892,15 @@ public void testPushTopNInlineDistanceToSource() { names(project.projections()), contains( equalTo("abbrev"), - equalTo("name"), - equalTo("location"), - equalTo("country"), equalTo("city"), + equalTo("country"), + 
equalTo("location"), + equalTo("name"), startsWith("$$order_by$0$") ) ); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var evalExec = as(extract.child(), EvalExec.class); var alias = as(evalExec.fields().get(0), Alias.class); assertThat(alias.name(), startsWith("$$order_by$0$")); @@ -5884,14 +5929,14 @@ public void testPushTopNInlineDistanceToSource() { /** * - * ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18]] - * \_TopNExec[[Order[distance{r}#4,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18, distance{r}#4],false] - * \_ProjectExec[[abbrev{f}#12, name{f}#13, location{f}#16, country{f}#17, city{f}#18, distance{r}#4]] - * \_FieldExtractExec[abbrev{f}#12, name{f}#13, country{f}#17, city{f}#18][] - * \_EvalExec[[STDISTANCE(location{f}#16,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance - * ]] - * \_FieldExtractExec[location{f}#16][] + * ProjectExec[[abbrev{f}#361, name{f}#362, location{f}#365, country{f}#366, city{f}#367]] + * \_TopNExec[[Order[distance{r}#353,ASC,LAST]],5[INTEGER],229] + * \_ExchangeExec[[abbrev{f}#361, city{f}#367, country{f}#366, location{f}#365, name{f}#362, distance{r}#353],false] + * \_ProjectExec[[abbrev{f}#361, city{f}#367, country{f}#366, location{f}#365, name{f}#362, distance{r}#353]] + * \_FieldExtractExec[abbrev{f}#361, city{f}#367, country{f}#366, name{f}..]<[],[]> + * \_EvalExec[[STDISTANCE(location{f}#365,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distanc + * e#353]] + * \_FieldExtractExec[location{f}#365]<[],[]> * \_EsQueryExec[airports], indexMode[standard], query[ * { * "geo_shape":{ @@ -5904,7 +5949,7 @@ public void testPushTopNInlineDistanceToSource() { * } * } * } - * }][_doc{f}#29], limit[5], sort[[GeoDistanceSort[field=location{f}#16, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245] + * ][_doc{f}#378], limit[5], sort[[GeoDistanceSort[field=location{f}#365, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245] * */ public void testPushTopNDistanceWithFilterToSource() { @@ -5922,9 +5967,9 @@ public void testPushTopNDistanceWithFilterToSource() { var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "distance")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var evalExec = as(extract.child(), EvalExec.class); var alias = as(evalExec.fields().get(0), Alias.class); assertThat(alias.name(), is("distance")); @@ -5960,48 +6005,25 @@ public void testPushTopNDistanceWithFilterToSource() { /** * - * ProjectExec[[abbrev{f}#14, name{f}#15, location{f}#18, country{f}#19, city{f}#20]] - * \_TopNExec[[Order[distance{r}#4,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#14, name{f}#15, location{f}#18, country{f}#19, city{f}#20, distance{r}#4],false] - * \_ProjectExec[[abbrev{f}#14, name{f}#15, location{f}#18, 
country{f}#19, city{f}#20, distance{r}#4]] - * \_FieldExtractExec[abbrev{f}#14, name{f}#15, country{f}#19, city{f}#20][] - * \_EvalExec[[STDISTANCE(location{f}#18,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) - * AS distance]] - * \_FieldExtractExec[location{f}#18][] - * \_EsQueryExec[airports], indexMode[standard], query[{ - * "bool":{ - * "filter":[ - * { - * "esql_single_value":{ - * "field":"scalerank", - * "next":{"range":{"scalerank":{"lt":6,"boost":1.0}}}, - * "source":"scalerank lt 6@3:31" - * } - * }, - * { - * "bool":{ - * "must":[ - * {"geo_shape":{ - * "location":{ - * "relation":"INTERSECTS", - * "shape":{"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]} - * } - * }}, - * {"geo_shape":{ - * "location":{ - * "relation":"DISJOINT", - * "shape":{"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]} - * } - * }} - * ], - * "boost":1.0 - * } - * } - * ], - * "boost":1.0 - * }}][_doc{f}#31], limit[5], sort[[ - * GeoDistanceSort[field=location{f}#18, direction=ASC, lat=55.673, lon=12.565] - * ]] estimatedRowSize[245] + * ProjectExec[[abbrev{f}#6367, name{f}#6368, location{f}#6371, country{f}#6372, city{f}#6373]] + * \_TopNExec[[Order[distance{r}#6357,ASC,LAST]],5[INTEGER],229] + * \_ExchangeExec[[abbrev{f}#6367, city{f}#6373, country{f}#6372, location{f}#6371, name{f}#6368, distance{r}#6357],false] + * \_ProjectExec[[abbrev{f}#6367, city{f}#6373, country{f}#6372, location{f}#6371, name{f}#6368, distance{r}#6357]] + * \_FieldExtractExec[abbrev{f}#6367, city{f}#6373, country{f}#6372, name..]<[],[]> + * \_EvalExec[[STDISTANCE(location{f}#6371,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan + * ce#6357]] + * \_FieldExtractExec[location{f}#6371]<[],[]> + * \_EsQueryExec[airports], indexMode[standard], query[ + * {"bool":{"filter":[{"esql_single_value":{"field":"scalerank","next":{"range": + * {"scalerank":{"lt":6,"boost":0.0}}},"source":"scalerank < 6@3:31"}}, + * {"bool":{"must":[{"geo_shape": + * {"location":{"relation":"INTERSECTS","shape": + * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}}, + * {"geo_shape":{"location":{"relation":"DISJOINT","shape": + * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}}] + * ,"boost":1.0}}],"boost":1.0}} + * ][_doc{f}#6384], limit[5], sort[ + * [GeoDistanceSort[field=location{f}#6371, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[245] * */ public void testPushTopNDistanceWithCompoundFilterToSource() { @@ -6019,9 +6041,9 @@ public void testPushTopNDistanceWithCompoundFilterToSource() { var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "distance")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var evalExec = as(extract.child(), EvalExec.class); var alias = as(evalExec.fields().get(0), Alias.class); assertThat(alias.name(), is("distance")); @@ -6059,35 +6081,28 @@ public void testPushTopNDistanceWithCompoundFilterToSource() { /** * Tests that multiple sorts, including distance and a field, are pushed 
down to the source. * - * ProjectExec[[abbrev{f}#25, name{f}#26, location{f}#29, country{f}#30, city{f}#31, scalerank{f}#27, scale{r}#7]] - * \_TopNExec[[ - * Order[distance{r}#4,ASC,LAST], - * Order[scalerank{f}#27,ASC,LAST], - * Order[scale{r}#7,DESC,FIRST], - * Order[loc{r}#10,DESC,FIRST] - * ],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#25, name{f}#26, location{f}#29, country{f}#30, city{f}#31, scalerank{f}#27, scale{r}#7, - * distance{r}#4, loc{r}#10],false] - * \_ProjectExec[[abbrev{f}#25, name{f}#26, location{f}#29, country{f}#30, city{f}#31, scalerank{f}#27, scale{r}#7, - * distance{r}#4, loc{r}#10]] - * \_FieldExtractExec[abbrev{f}#25, name{f}#26, country{f}#30, city{f}#31][] - * \_EvalExec[[ - * STDISTANCE(location{f}#29,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance, - * 10[INTEGER] - scalerank{f}#27 AS scale, TOSTRING(location{f}#29) AS loc - * ]] - * \_FieldExtractExec[location{f}#29, scalerank{f}#27][] - * \_EsQueryExec[airports], indexMode[standard], query[{ - * "bool":{ - * "filter":[ - * {"esql_single_value":{"field":"scalerank","next":{...},"source":"scalerank < 6@3:31"}}, - * {"bool":{ - * "must":[ - * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}}, - * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}} - * ],"boost":1.0}}],"boost":1.0}}][_doc{f}#44], limit[5], sort[[ - * GeoDistanceSort[field=location{f}#29, direction=ASC, lat=55.673, lon=12.565], - * FieldSort[field=scalerank{f}#27, direction=ASC, nulls=LAST] - * ]] estimatedRowSize[303] + * ProjectExec[[abbrev{f}#7429, name{f}#7430, location{f}#7433, country{f}#7434, city{f}#7435, scalerank{f}#7431, scale{r}#74 + * 11]] + * \_TopNExec[[Order[distance{r}#7408,ASC,LAST], Order[scalerank{f}#7431,ASC,LAST], Order[scale{r}#7411,DESC,FIRST], Order[l + * oc{r}#7414,DESC,FIRST]],5[INTEGER],287] + * \_ExchangeExec[[abbrev{f}#7429, city{f}#7435, country{f}#7434, location{f}#7433, name{f}#7430, scalerank{f}#7431, distance{r} + * #7408, scale{r}#7411, loc{r}#7414],false] + * \_ProjectExec[[abbrev{f}#7429, city{f}#7435, country{f}#7434, location{f}#7433, name{f}#7430, scalerank{f}#7431, distance{r} + * #7408, scale{r}#7411, loc{r}#7414]] + * \_FieldExtractExec[abbrev{f}#7429, city{f}#7435, country{f}#7434, name..]<[],[]> + * \_EvalExec[[STDISTANCE(location{f}#7433,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan + * ce#7408, 10[INTEGER] - scalerank{f}#7431 AS scale#7411, TOSTRING(location{f}#7433) AS loc#7414]] + * \_FieldExtractExec[location{f}#7433, scalerank{f}#7431]<[],[]> + * \_EsQueryExec[airports], indexMode[standard], query[ + * {"bool":{"filter":[{"esql_single_value":{"field":"scalerank","next": + * {"range":{"scalerank":{"lt":6,"boost":0.0}}},"source":"scalerank < 6@3:31"}}, + * {"bool":{"must":[{"geo_shape":{"location":{"relation":"INTERSECTS","shape": + * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}}, + * {"geo_shape":{"location":{"relation":"DISJOINT","shape": + * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}}], + * "boost":1.0}}],"boost":1.0}}][_doc{f}#7448], limit[5], sort[ + * [GeoDistanceSort[field=location{f}#7433, direction=ASC, lat=55.673, lon=12.565], + * FieldSort[field=scalerank{f}#7431, direction=ASC, nulls=LAST]]] estimatedRowSize[303] * */ public void testPushTopNDistanceAndPushableFieldWithCompoundFilterToSource() { @@ -6108,10 +6123,10 @@ public void testPushTopNDistanceAndPushableFieldWithCompoundFilterToSource() { project = 
as(exchange.child(), ProjectExec.class); assertThat( names(project.projections()), - contains("abbrev", "name", "location", "country", "city", "scalerank", "scale", "distance", "loc") + contains("abbrev", "city", "country", "location", "name", "scalerank", "distance", "scale", "loc") ); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var evalExec = as(extract.child(), EvalExec.class); var alias = as(evalExec.fields().get(0), Alias.class); assertThat(alias.name(), is("distance")); @@ -6153,26 +6168,30 @@ public void testPushTopNDistanceAndPushableFieldWithCompoundFilterToSource() { /** * This test shows that if the filter contains a predicate on the same field that is sorted, we cannot push down the sort. * - * ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scalerank{f}#25 AS scale]] - * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scalerank{f}#25,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scalerank{f}#25, distance{r}#4],false] - * \_ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scalerank{f}#25, distance{r}#4]] - * \_FieldExtractExec[abbrev{f}#23, name{f}#24, country{f}#28, city{f}#29][] - * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scalerank{f}#25,ASC,LAST]],5[INTEGER],208] - * \_FieldExtractExec[scalerank{f}#25][] - * \_FilterExec[SUBSTRING(position{r}#7,1[INTEGER],5[INTEGER]) == [50 4f 49 4e 54][KEYWORD]] - * \_EvalExec[[ - * STDISTANCE(location{f}#27,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance, - * TOSTRING(location{f}#27) AS position - * ]] - * \_FieldExtractExec[location{f}#27][] - * \_EsQueryExec[airports], indexMode[standard], query[{ - * "bool":{"filter":[ - * {"esql_single_value":{"field":"scalerank","next":{"range":{"scalerank":{"lt":6,"boost":1.0}}},"source":...}}, - * {"bool":{"must":[ - * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}}, - * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}} - * ],"boost":1.0}}],"boost":1.0}}][_doc{f}#42], limit[], sort[] estimatedRowSize[87] + * ProjectExec[[abbrev{f}#4856, name{f}#4857, location{f}#4860, country{f}#4861, city{f}#4862, scalerank{f}#4858 AS scale#484 + * 3]] + * \_TopNExec[[Order[distance{r}#4837,ASC,LAST], Order[scalerank{f}#4858,ASC,LAST]],5[INTEGER],233] + * \_ExchangeExec[[abbrev{f}#4856, city{f}#4862, country{f}#4861, location{f}#4860, name{f}#4857, scalerank{f}#4858, distance{r} + * #4837],false] + * \_ProjectExec[[abbrev{f}#4856, city{f}#4862, country{f}#4861, location{f}#4860, name{f}#4857, scalerank{f}#4858, distance{r} + * #4837]] + * \_FieldExtractExec[abbrev{f}#4856, city{f}#4862, country{f}#4861, name..]<[],[]> + * \_TopNExec[[Order[distance{r}#4837,ASC,LAST], Order[scalerank{f}#4858,ASC,LAST]],5[INTEGER],303] + * \_FieldExtractExec[scalerank{f}#4858]<[],[]> + * \_FilterExec[SUBSTRING(position{r}#4840,1[INTEGER],5[INTEGER]) == POINT[KEYWORD]] + * \_EvalExec[[STDISTANCE(location{f}#4860,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS + * distance#4837, TOSTRING(location{f}#4860) AS position#4840]] + * \_FieldExtractExec[location{f}#4860]<[],[]> + * \_EsQueryExec[airports], indexMode[standard], query[ + * {"bool":{"filter":[ + * {"esql_single_value": + * 
{"field":"scalerank","next":{"range":{"scalerank":{"lt":6,"boost":0.0}}},"source":"scale < 6@3:93"}}, + * {"bool":{"must":[ + * {"geo_shape":{"location":{"relation":"INTERSECTS","shape": + * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}}, + * {"geo_shape":{"location":{"relation":"DISJOINT","shape": + * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}} + * ],"boost":1.0}}],"boost":1.0}}][_doc{f}#4875], limit[], sort[] estimatedRowSize[87] * */ public void testPushTopNDistanceAndNonPushableEvalWithCompoundFilterToSource() { @@ -6191,9 +6210,9 @@ public void testPushTopNDistanceAndNonPushableEvalWithCompoundFilterToSource() { var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "scalerank", "distance")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var topNChild = as(extract.child(), TopNExec.class); extract = as(topNChild.child(), FieldExtractExec.class); assertThat(names(extract.attributesToExtract()), contains("scalerank")); @@ -6228,27 +6247,25 @@ public void testPushTopNDistanceAndNonPushableEvalWithCompoundFilterToSource() { /** * This test shows that if the filter contains a predicate on the same field that is sorted, we cannot push down the sort. * - * ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scale{r}#10]] - * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scale{r}#10,ASC,LAST]],5[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scale{r}#10, distance{r}#4],false] - * \_ProjectExec[[abbrev{f}#23, name{f}#24, location{f}#27, country{f}#28, city{f}#29, scale{r}#10, distance{r}#4]] - * \_FieldExtractExec[abbrev{f}#23, name{f}#24, country{f}#28, city{f}#29][] - * \_TopNExec[[Order[distance{r}#4,ASC,LAST], Order[scale{r}#10,ASC,LAST]],5[INTEGER],208] - * \_FilterExec[ - * SUBSTRING(position{r}#7,1[INTEGER],5[INTEGER]) == [50 4f 49 4e 54][KEYWORD] - * AND scale{r}#10 > 3[INTEGER] - * ] - * \_EvalExec[[ - * STDISTANCE(location{f}#27,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distance, - * TOSTRING(location{f}#27) AS position, - * 10[INTEGER] - scalerank{f}#25 AS scale - * ]] - * \_FieldExtractExec[location{f}#27, scalerank{f}#25][] - * \_EsQueryExec[airports], indexMode[standard], query[{ - * "bool":{"must":[ - * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}}, - * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}} - * ],"boost":1.0}}][_doc{f}#42], limit[], sort[] estimatedRowSize[91] + *ProjectExec[[abbrev{f}#1447, name{f}#1448, location{f}#1451, country{f}#1452, city{f}#1453, scalerank{r}#1434]] + * \_TopNExec[[Order[distance{r}#1428,ASC,LAST], Order[scalerank{r}#1434,ASC,LAST]],5[INTEGER],233] + * \_ExchangeExec[[abbrev{f}#1447, city{f}#1453, country{f}#1452, location{f}#1451, name{f}#1448, distance{r}#1428, scalerank{r} + * #1434],false] + * \_ProjectExec[[abbrev{f}#1447, city{f}#1453, country{f}#1452, location{f}#1451, name{f}#1448, distance{r}#1428, scalerank{r} + * #1434]] + * 
\_FieldExtractExec[abbrev{f}#1447, city{f}#1453, country{f}#1452, name..]<[],[]> + * \_TopNExec[[Order[distance{r}#1428,ASC,LAST], Order[scalerank{r}#1434,ASC,LAST]],5[INTEGER],303] + * \_FilterExec[SUBSTRING(position{r}#1431,1[INTEGER],5[INTEGER]) == POINT[KEYWORD] AND scalerank{r}#1434 > 3[INTEGER]] + * \_EvalExec[[STDISTANCE(location{f}#1451,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan + * ce#1428, TOSTRING(location{f}#1451) AS position#1431, 10[INTEGER] - scalerank{f}#1449 AS scalerank#1434]] + * \_FieldExtractExec[location{f}#1451, scalerank{f}#1449]<[],[]> + * \_EsQueryExec[airports], indexMode[standard], query[ + * {"bool":{"must":[ + * {"geo_shape":{"location":{"relation":"INTERSECTS","shape": + * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}}, + * {"geo_shape":{"location":{"relation":"DISJOINT","shape": + * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}} + * ],"boost":1.0}}][_doc{f}#1466], limit[], sort[] estimatedRowSize[91] * */ public void testPushTopNDistanceAndNonPushableEvalsWithCompoundFilterToSource() { @@ -6267,9 +6284,9 @@ public void testPushTopNDistanceAndNonPushableEvalsWithCompoundFilterToSource() var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "distance", "scalerank")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var topNChild = as(extract.child(), TopNExec.class); var filter = as(topNChild.child(), FilterExec.class); assertThat(filter.condition(), isA(And.class)); @@ -6344,9 +6361,9 @@ public void testPushTopNDistanceWithCompoundFilterToSourceAndDisjunctiveNonPusha var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "scalerank", "distance")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name")); var topNChild = as(extract.child(), TopNExec.class); var filter = as(topNChild.child(), FilterExec.class); assertThat(filter.condition(), isA(Or.class)); @@ -6373,28 +6390,29 @@ public void testPushTopNDistanceWithCompoundFilterToSourceAndDisjunctiveNonPusha /** * - * ProjectExec[[abbrev{f}#15, name{f}#16, location{f}#19, country{f}#20, city{f}#21]] - * \_TopNExec[[Order[scalerank{f}#17,ASC,LAST], Order[distance{r}#4,ASC,LAST]],15[INTEGER],0] - * \_ExchangeExec[[abbrev{f}#15, name{f}#16, location{f}#19, country{f}#20, city{f}#21, scalerank{f}#17, distance{r}#4],false] - * \_ProjectExec[[abbrev{f}#15, name{f}#16, location{f}#19, country{f}#20, city{f}#21, scalerank{f}#17, distance{r}#4]] - * \_FieldExtractExec[abbrev{f}#15, name{f}#16, country{f}#20, city{f}#21, ..][] - * \_EvalExec[[STDISTANCE(location{f}#19,[1 1 0 0 0 
e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) - * AS distance]] - * \_FieldExtractExec[location{f}#19][] - * \_EsQueryExec[airports], indexMode[standard], query[{ - * "bool":{ - * "filter":[ - * {"esql_single_value":{"field":"scalerank",...,"source":"scalerank lt 6@3:31"}}, - * {"bool":{"must":[ - * {"geo_shape":{"location":{"relation":"INTERSECTS","shape":{...}}}}, - * {"geo_shape":{"location":{"relation":"DISJOINT","shape":{...}}}} - * ],"boost":1.0}} - * ],"boost":1.0 - * } - * }][_doc{f}#32], limit[], sort[[ - * FieldSort[field=scalerank{f}#17, direction=ASC, nulls=LAST], - * GeoDistanceSort[field=location{f}#19, direction=ASC, lat=55.673, lon=12.565] - * ]] estimatedRowSize[37] + * ProjectExec[[abbrev{f}#6090, name{f}#6091, location{f}#6094, country{f}#6095, city{f}#6096]] + * \_TopNExec[[Order[scalerank{f}#6092,ASC,LAST], Order[distance{r}#6079,ASC,LAST]],15[INTEGER],233] + * \_ExchangeExec[[abbrev{f}#6090, city{f}#6096, country{f}#6095, location{f}#6094, name{f}#6091, scalerank{f}#6092, distance{r} + * #6079],false] + * \_ProjectExec[[abbrev{f}#6090, city{f}#6096, country{f}#6095, location{f}#6094, name{f}#6091, scalerank{f}#6092, distance{r} + * #6079]] + * \_FieldExtractExec[abbrev{f}#6090, city{f}#6096, country{f}#6095, name..]<[],[]> + * \_EvalExec[[STDISTANCE(location{f}#6094,[1 1 0 0 0 e1 7a 14 ae 47 21 29 40 a0 1a 2f dd 24 d6 4b 40][GEO_POINT]) AS distan + * ce#6079]] + * \_FieldExtractExec[location{f}#6094]<[],[]> + * \_EsQueryExec[airports], indexMode[standard], query[ + * {"bool":{"filter":[ + * {"esql_single_value":{"field":"scalerank","next":{"range": + * {"scalerank":{"lt":6,"boost":0.0}}},"source":"scalerank < 6@3:31"}}, + * {"bool":{"must":[ + * {"geo_shape": {"location":{"relation":"INTERSECTS","shape": + * {"type":"Circle","radius":"499999.99999999994m","coordinates":[12.565,55.673]}}}}, + * {"geo_shape":{"location":{"relation":"DISJOINT","shape": + * {"type":"Circle","radius":"10000.000000000002m","coordinates":[12.565,55.673]}}}} + * ],"boost":1.0}}],"boost":1.0}} + * ][_doc{f}#6107], limit[15], sort[ + * [FieldSort[field=scalerank{f}#6092, direction=ASC, nulls=LAST], + * GeoDistanceSort[field=location{f}#6094, direction=ASC, lat=55.673, lon=12.565]]] estimatedRowSize[249] * */ public void testPushCompoundTopNDistanceWithCompoundFilterToSource() { @@ -6413,9 +6431,9 @@ public void testPushCompoundTopNDistanceWithCompoundFilterToSource() { var exchange = asRemoteExchange(topN.child()); project = as(exchange.child(), ProjectExec.class); - assertThat(names(project.projections()), contains("abbrev", "name", "location", "country", "city", "scalerank", "distance")); + assertThat(names(project.projections()), contains("abbrev", "city", "country", "location", "name", "scalerank", "distance")); var extract = as(project.child(), FieldExtractExec.class); - assertThat(names(extract.attributesToExtract()), contains("abbrev", "name", "country", "city", "scalerank")); + assertThat(names(extract.attributesToExtract()), contains("abbrev", "city", "country", "name", "scalerank")); var evalExec = as(extract.child(), EvalExec.class); var alias = as(evalExec.fields().get(0), Alias.class); assertThat(alias.name(), is("distance")); @@ -8053,7 +8071,7 @@ public void testNotEqualsPushdownToDelegate() { * ges{f}#5, last_name{f}#6, long_noidx{f}#12, salary{f}#7],false] * \_ProjectExec[[_meta_field{f}#8, emp_no{f}#2, first_name{f}#3, gender{f}#4, hire_date{f}#9, job{f}#10, job.raw{f}#11, langua * ges{f}#5, last_name{f}#6, long_noidx{f}#12, salary{f}#7]] - * 
\_FieldExtractExec[_meta_field{f}#8, emp_no{f}#2, first_name{f}#3, gen..]<[],[]>
 * \_EsQueryExec[test], indexMode[standard],
 * query[{"bool":{"filter":[{"sampling":{"probability":0.1,"seed":234,"hash":0}}],"boost":1.0}}]
 * [_doc{f}#24], limit[1000], sort[] estimatedRowSize[332]

From e3cf9bbd2cd17fa72acdc23cb42007d22929c541 Mon Sep 17 00:00:00 2001
From: Jan Kuipers <148754765+jan-elastic@users.noreply.github.com>
Date: Thu, 17 Jul 2025 10:24:30 +0200
Subject: [PATCH 4/7] ES|QL categorize options (#131104)

* ES|QL categorize options

* refactor options

* fix serialization

* polish

* add verifications

* better test coverage + polish code

* better test coverage + polish code

---
 .../compute/operator/AggregatorBenchmark.java |   2 +-
 .../functionNamedParams/categorize.md         |  13 ++
 .../_snippets/functions/layout/categorize.md  |   3 +
 .../functions/parameters/categorize.md        |   3 +
 .../_snippets/functions/types/categorize.md   |   8 +-
 .../esql/images/functions/categorize.svg      |   2 +-
 .../org/elasticsearch/TransportVersions.java  |   1 +
 .../aggregation/blockhash/BlockHash.java      |  30 +++--
 .../blockhash/CategorizeBlockHash.java        |  39 ++++--
 .../CategorizePackedValuesBlockHash.java      |  10 +-
 .../blockhash/CategorizeBlockHashTests.java   | 110 ++++++++++------
 .../CategorizePackedValuesBlockHashTests.java |  19 ++-
 .../blockhash/TopNBlockHashTests.java         |   2 +-
 .../HashAggregationOperatorTests.java         |   4 +-
 .../src/main/resources/categorize.csv-spec    |  81 +++++++++++-
 .../xpack/esql/action/EsqlCapabilities.java   |  11 +-
 .../esql/expression/function/Options.java     | 107 ++++++++++++++++
 .../function/fulltext/FullTextFunction.java   |  71 -----------
 .../expression/function/fulltext/Match.java   |  12 +-
 .../function/fulltext/MatchPhrase.java        |  10 +-
 .../function/fulltext/MultiMatch.java         |   3 +-
 .../function/fulltext/QueryString.java        |  10 +-
 .../function/grouping/Categorize.java         | 119 ++++++++++++++++--
 .../esql/expression/function/vector/Knn.java  |  38 +-----
 ...laceAggregateNestedExpressionWithEval.java |   7 +-
 .../AbstractPhysicalOperationProviders.java   |   8 +-
 .../xpack/esql/analysis/VerifierTests.java    |  51 ++++++++
 .../grouping/CategorizeErrorTests.java        |   2 +-
 .../function/grouping/CategorizeTests.java    |   2 +-
 .../rules/logical/FoldNullTests.java          |   2 +-
 .../SerializableTokenListCategory.java        |   7 ++
 31 files changed, 572 insertions(+), 215 deletions(-)
 create mode 100644 docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md
 create mode 100644 x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java

diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
index d144d7601349d..d5fe1b4a697e0 100644
--- a/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
+++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/AggregatorBenchmark.java
@@ -191,7 +191,7 @@ private static Operator operator(DriverContext driverContext, String grouping, S
             new BlockHash.GroupSpec(2, ElementType.BYTES_REF)
         );
         case TOP_N_LONGS -> List.of(
-            new BlockHash.GroupSpec(0, ElementType.LONG, false, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT))
+            new BlockHash.GroupSpec(0, ElementType.LONG, null, new BlockHash.TopNDef(0, true, true, TOP_N_LIMIT))
         );
         default -> throw new IllegalArgumentException("unsupported 
grouping [" + grouping + "]"); }; diff --git a/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md new file mode 100644 index 0000000000000..acd2064002b44 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/categorize.md @@ -0,0 +1,13 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +**Supported function named parameters** + +`output_format` +: (boolean) The output format of the categories. Defaults to regex. + +`similarity_threshold` +: (boolean) The minimum percentage of token weight that must match for text to be added to the category bucket. Must be between 1 and 100. The larger the value the narrower the categories. Larger values will increase memory usage and create narrower categories. Defaults to 70. + +`analyzer` +: (keyword) Analyzer used to convert the field into tokens for text categorization. + diff --git a/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md index ca23c1e2efc23..2e331187665f4 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md +++ b/docs/reference/query-languages/esql/_snippets/functions/layout/categorize.md @@ -19,5 +19,8 @@ :::{include} ../types/categorize.md ::: +:::{include} ../functionNamedParams/categorize.md +::: + :::{include} ../examples/categorize.md ::: diff --git a/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md index 8733908754570..c013b67375a3d 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md +++ b/docs/reference/query-languages/esql/_snippets/functions/parameters/categorize.md @@ -5,3 +5,6 @@ `field` : Expression to categorize +`options` +: (Optional) Categorize additional options as [function named parameters](/reference/query-languages/esql/esql-syntax.md#esql-function-named-params). 
+ diff --git a/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md b/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md index 6043fbe719ff8..8ebe22b61286c 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md +++ b/docs/reference/query-languages/esql/_snippets/functions/types/categorize.md @@ -2,8 +2,8 @@ **Supported types** -| field | result | -| --- | --- | -| keyword | keyword | -| text | keyword | +| field | options | result | +| --- | --- | --- | +| keyword | | keyword | +| text | | keyword | diff --git a/docs/reference/query-languages/esql/images/functions/categorize.svg b/docs/reference/query-languages/esql/images/functions/categorize.svg index bbb2bda7c480b..7629b9bb978ba 100644 --- a/docs/reference/query-languages/esql/images/functions/categorize.svg +++ b/docs/reference/query-languages/esql/images/functions/categorize.svg @@ -1 +1 @@ -CATEGORIZE(field) \ No newline at end of file +CATEGORIZE(field,options) \ No newline at end of file diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index ae0ccecf15ed7..99c255acf7268 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -340,6 +340,7 @@ static TransportVersion def(int id) { public static final TransportVersion ESQL_FIXED_INDEX_LIKE = def(9_119_0_00); public static final TransportVersion LOOKUP_JOIN_CCS = def(9_120_0_00); public static final TransportVersion NODE_USAGE_STATS_FOR_THREAD_POOLS_IN_CLUSTER_INFO = def(9_121_0_00); + public static final TransportVersion ESQL_CATEGORIZE_OPTIONS = def(9_122_0_00); /* * STOP! READ THIS FIRST! No, really, diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java index 1cae296f09c02..63f4d9c96bcd0 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/BlockHash.java @@ -128,16 +128,26 @@ public abstract class BlockHash implements Releasable, SeenGroupIds { public record TopNDef(int order, boolean asc, boolean nullsFirst, int limit) {} /** - * @param isCategorize Whether this group is a CATEGORIZE() or not. - * May be changed in the future when more stateful grouping functions are added. + * Configuration for a BlockHash group spec that is doing text categorization. 
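+ * For example, {@code new CategorizeDef(null, OutputFormat.REGEX, 70)} (an
+ * illustrative instance, not part of this patch) matches the previous
+ * hard-coded behaviour: the default categorization analyzer, regex category
+ * keys, and a 70% similarity threshold.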
*/ - public record GroupSpec(int channel, ElementType elementType, boolean isCategorize, @Nullable TopNDef topNDef) { + public record CategorizeDef(String analyzer, OutputFormat outputFormat, int similarityThreshold) { + public enum OutputFormat { + REGEX, + TOKENS + } + } + + public record GroupSpec(int channel, ElementType elementType, @Nullable CategorizeDef categorizeDef, @Nullable TopNDef topNDef) { public GroupSpec(int channel, ElementType elementType) { - this(channel, elementType, false, null); + this(channel, elementType, null, null); + } + + public GroupSpec(int channel, ElementType elementType, CategorizeDef categorizeDef) { + this(channel, elementType, categorizeDef, null); } - public GroupSpec(int channel, ElementType elementType, boolean isCategorize) { - this(channel, elementType, isCategorize, null); + public boolean isCategorize() { + return categorizeDef != null; } } @@ -207,7 +217,13 @@ public static BlockHash buildCategorizeBlockHash( int emitBatchSize ) { if (groups.size() == 1) { - return new CategorizeBlockHash(blockFactory, groups.get(0).channel, aggregatorMode, analysisRegistry); + return new CategorizeBlockHash( + blockFactory, + groups.get(0).channel, + aggregatorMode, + groups.get(0).categorizeDef, + analysisRegistry + ); } else { assert groups.get(0).isCategorize(); assert groups.subList(1, groups.size()).stream().noneMatch(GroupSpec::isCategorize); diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java index 5e716d8c9d5ff..fcc1a7f3d271e 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java @@ -18,7 +18,6 @@ import org.elasticsearch.common.util.BytesRefHash; import org.elasticsearch.compute.aggregation.AggregatorMode; import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction; -import org.elasticsearch.compute.aggregation.SeenGroupIds; import org.elasticsearch.compute.data.Block; import org.elasticsearch.compute.data.BlockFactory; import org.elasticsearch.compute.data.BytesRefBlock; @@ -47,12 +46,13 @@ */ public class CategorizeBlockHash extends BlockHash { - private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig + private static final CategorizationAnalyzerConfig DEFAULT_ANALYZER_CONFIG = CategorizationAnalyzerConfig .buildStandardEsqlCategorizationAnalyzer(); private static final int NULL_ORD = 0; private final int channel; private final AggregatorMode aggregatorMode; + private final CategorizeDef categorizeDef; private final TokenListCategorizer.CloseableTokenListCategorizer categorizer; private final CategorizeEvaluator evaluator; @@ -64,28 +64,38 @@ public class CategorizeBlockHash extends BlockHash { */ private boolean seenNull = false; - CategorizeBlockHash(BlockFactory blockFactory, int channel, AggregatorMode aggregatorMode, AnalysisRegistry analysisRegistry) { + CategorizeBlockHash( + BlockFactory blockFactory, + int channel, + AggregatorMode aggregatorMode, + CategorizeDef categorizeDef, + AnalysisRegistry analysisRegistry + ) { super(blockFactory); this.channel = channel; this.aggregatorMode = aggregatorMode; + this.categorizeDef = categorizeDef; this.categorizer = new TokenListCategorizer.CloseableTokenListCategorizer( 
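            // Arguments: a bytes-ref hash the categorizer uses to store category tokens,
            // the shared part-of-speech dictionary, and the similarity threshold, rescaled
            // from the 1-100 option value to the 0-1 fraction the categorizer expects.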
new CategorizationBytesRefHash(new BytesRefHash(2048, blockFactory.bigArrays())), CategorizationPartOfSpeechDictionary.getInstance(), - 0.70f + categorizeDef.similarityThreshold() / 100.0f ); if (aggregatorMode.isInputPartial() == false) { - CategorizationAnalyzer analyzer; + CategorizationAnalyzer categorizationAnalyzer; try { Objects.requireNonNull(analysisRegistry); - analyzer = new CategorizationAnalyzer(analysisRegistry, ANALYZER_CONFIG); - } catch (Exception e) { + CategorizationAnalyzerConfig config = categorizeDef.analyzer() == null + ? DEFAULT_ANALYZER_CONFIG + : new CategorizationAnalyzerConfig.Builder().setAnalyzer(categorizeDef.analyzer()).build(); + categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, config); + } catch (IOException e) { categorizer.close(); throw new RuntimeException(e); } - this.evaluator = new CategorizeEvaluator(analyzer); + this.evaluator = new CategorizeEvaluator(categorizationAnalyzer); } else { this.evaluator = null; } @@ -114,7 +124,7 @@ public IntVector nonEmpty() { @Override public BitArray seenGroupIds(BigArrays bigArrays) { - return new SeenGroupIds.Range(seenNull ? 0 : 1, Math.toIntExact(categorizer.getCategoryCount() + 1)).seenGroupIds(bigArrays); + return new Range(seenNull ? 0 : 1, Math.toIntExact(categorizer.getCategoryCount() + 1)).seenGroupIds(bigArrays); } @Override @@ -222,7 +232,7 @@ private Block buildFinalBlock() { try (BytesRefBlock.Builder result = blockFactory.newBytesRefBlockBuilder(categorizer.getCategoryCount())) { result.appendNull(); for (SerializableTokenListCategory category : categorizer.toCategoriesById()) { - scratch.copyChars(category.getRegex()); + scratch.copyChars(getKeyString(category)); result.appendBytesRef(scratch.get()); scratch.clear(); } @@ -232,7 +242,7 @@ private Block buildFinalBlock() { try (BytesRefVector.Builder result = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) { for (SerializableTokenListCategory category : categorizer.toCategoriesById()) { - scratch.copyChars(category.getRegex()); + scratch.copyChars(getKeyString(category)); result.appendBytesRef(scratch.get()); scratch.clear(); } @@ -240,6 +250,13 @@ private Block buildFinalBlock() { } } + private String getKeyString(SerializableTokenListCategory category) { + return switch (categorizeDef.outputFormat()) { + case REGEX -> category.getRegex(); + case TOKENS -> category.getKeyTokensString(); + }; + } + /** * Similar implementation to an Evaluator. 
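 * (Roughly: it feeds each input value through the categorization analyzer and
 * returns the ordinal of the matching category from the TokenListCategorizer.)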
*/ diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java index 20874cb10ceb8..bb5f0dee8ca2d 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHash.java @@ -56,6 +56,8 @@ public class CategorizePackedValuesBlockHash extends BlockHash { int emitBatchSize ) { super(blockFactory); + assert specs.get(0).categorizeDef() != null; + this.specs = specs; this.aggregatorMode = aggregatorMode; blocks = new Block[specs.size()]; @@ -68,7 +70,13 @@ public class CategorizePackedValuesBlockHash extends BlockHash { boolean success = false; try { - categorizeBlockHash = new CategorizeBlockHash(blockFactory, specs.get(0).channel(), aggregatorMode, analysisRegistry); + categorizeBlockHash = new CategorizeBlockHash( + blockFactory, + specs.get(0).channel(), + aggregatorMode, + specs.get(0).categorizeDef(), + analysisRegistry + ); packedValuesBlockHash = new PackedValuesBlockHash(delegateSpecs, blockFactory, emitBatchSize); success = true; } finally { diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java index 842952f9ef8bd..9ce086307acee 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHashTests.java @@ -76,7 +76,13 @@ private void initAnalysisRegistry() throws IOException { ).getAnalysisRegistry(); } + private BlockHash.CategorizeDef getCategorizeDef() { + return new BlockHash.CategorizeDef(null, randomFrom(BlockHash.CategorizeDef.OutputFormat.values()), 70); + } + public void testCategorizeRaw() { + BlockHash.CategorizeDef categorizeDef = getCategorizeDef(); + final Page page; boolean withNull = randomBoolean(); final int positions = 7 + (withNull ? 
1 : 0); @@ -98,7 +104,7 @@ public void testCategorizeRaw() { page = new Page(builder.build()); } - try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, analysisRegistry)) { + try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, categorizeDef, analysisRegistry)) { for (int i = randomInt(2); i < 3; i++) { hash.add(page, new GroupingAggregatorFunction.AddInput() { private void addBlock(int positionOffset, IntBlock groupIds) { @@ -137,7 +143,10 @@ public void close() { } }); - assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?"); + switch (categorizeDef.outputFormat()) { + case REGEX -> assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?"); + case TOKENS -> assertHashState(hash, withNull, "Connected to", "Connection error", "Disconnected"); + } } } finally { page.releaseBlocks(); @@ -145,6 +154,8 @@ public void close() { } public void testCategorizeRawMultivalue() { + BlockHash.CategorizeDef categorizeDef = getCategorizeDef(); + final Page page; boolean withNull = randomBoolean(); final int positions = 3 + (withNull ? 1 : 0); @@ -170,7 +181,7 @@ public void testCategorizeRawMultivalue() { page = new Page(builder.build()); } - try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, analysisRegistry)) { + try (var hash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.SINGLE, categorizeDef, analysisRegistry)) { for (int i = randomInt(2); i < 3; i++) { hash.add(page, new GroupingAggregatorFunction.AddInput() { private void addBlock(int positionOffset, IntBlock groupIds) { @@ -216,7 +227,10 @@ public void close() { } }); - assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?"); + switch (categorizeDef.outputFormat()) { + case REGEX -> assertHashState(hash, withNull, ".*?Connected.+?to.*?", ".*?Connection.+?error.*?", ".*?Disconnected.*?"); + case TOKENS -> assertHashState(hash, withNull, "Connected to", "Connection error", "Disconnected"); + } } } finally { page.releaseBlocks(); @@ -224,6 +238,8 @@ public void close() { } public void testCategorizeIntermediate() { + BlockHash.CategorizeDef categorizeDef = getCategorizeDef(); + Page page1; boolean withNull = randomBoolean(); int positions1 = 7 + (withNull ? 
1 : 0); @@ -259,8 +275,8 @@ public void testCategorizeIntermediate() { // Fill intermediatePages with the intermediate state from the raw hashes try ( - BlockHash rawHash1 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, analysisRegistry); - BlockHash rawHash2 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, analysisRegistry); + BlockHash rawHash1 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, categorizeDef, analysisRegistry); + BlockHash rawHash2 = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.INITIAL, categorizeDef, analysisRegistry); ) { rawHash1.add(page1, new GroupingAggregatorFunction.AddInput() { private void addBlock(int positionOffset, IntBlock groupIds) { @@ -335,7 +351,7 @@ public void close() { page2.releaseBlocks(); } - try (var intermediateHash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.FINAL, null)) { + try (var intermediateHash = new CategorizeBlockHash(blockFactory, 0, AggregatorMode.FINAL, categorizeDef, null)) { intermediateHash.add(intermediatePage1, new GroupingAggregatorFunction.AddInput() { private void addBlock(int positionOffset, IntBlock groupIds) { List values = IntStream.range(0, groupIds.getPositionCount()) @@ -403,14 +419,24 @@ public void close() { } }); - assertHashState( - intermediateHash, - withNull, - ".*?Connected.+?to.*?", - ".*?Connection.+?error.*?", - ".*?Disconnected.*?", - ".*?System.+?shutdown.*?" - ); + switch (categorizeDef.outputFormat()) { + case REGEX -> assertHashState( + intermediateHash, + withNull, + ".*?Connected.+?to.*?", + ".*?Connection.+?error.*?", + ".*?Disconnected.*?", + ".*?System.+?shutdown.*?" + ); + case TOKENS -> assertHashState( + intermediateHash, + withNull, + "Connected to", + "Connection error", + "Disconnected", + "System shutdown" + ); + } } } finally { intermediatePage1.releaseBlocks(); @@ -419,6 +445,9 @@ public void close() { } public void testCategorize_withDriver() { + BlockHash.CategorizeDef categorizeDef = getCategorizeDef(); + BlockHash.GroupSpec groupSpec = new BlockHash.GroupSpec(0, ElementType.BYTES_REF, categorizeDef); + BigArrays bigArrays = new MockBigArrays(PageCacheRecycler.NON_RECYCLING_INSTANCE, ByteSizeValue.ofMb(256)).withCircuitBreaking(); CircuitBreaker breaker = bigArrays.breakerService().getBreaker(CircuitBreaker.REQUEST); DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays)); @@ -477,7 +506,7 @@ public void testCategorize_withDriver() { new LocalSourceOperator(input1), List.of( new HashAggregationOperator.HashAggregationOperatorFactory( - List.of(makeGroupSpec()), + List.of(groupSpec), AggregatorMode.INITIAL, List.of( new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.INITIAL, List.of(1)), @@ -496,7 +525,7 @@ public void testCategorize_withDriver() { new LocalSourceOperator(input2), List.of( new HashAggregationOperator.HashAggregationOperatorFactory( - List.of(makeGroupSpec()), + List.of(groupSpec), AggregatorMode.INITIAL, List.of( new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.INITIAL, List.of(1)), @@ -517,7 +546,7 @@ public void testCategorize_withDriver() { new CannedSourceOperator(intermediateOutput.iterator()), List.of( new HashAggregationOperator.HashAggregationOperatorFactory( - List.of(makeGroupSpec()), + List.of(groupSpec), AggregatorMode.FINAL, List.of( new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(AggregatorMode.FINAL, List.of(1, 2)), @@ -544,23 +573,36 @@ public void 
testCategorize_withDriver() { sums.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputSums.getLong(i)); maxs.put(outputTexts.getBytesRef(i, new BytesRef()).utf8ToString(), outputMaxs.getLong(i)); } + List keys = switch (categorizeDef.outputFormat()) { + case REGEX -> List.of( + ".*?aaazz.*?", + ".*?bbbzz.*?", + ".*?ccczz.*?", + ".*?dddzz.*?", + ".*?eeezz.*?", + ".*?words.+?words.+?words.+?goodbye.*?", + ".*?words.+?words.+?words.+?hello.*?" + ); + case TOKENS -> List.of("aaazz", "bbbzz", "ccczz", "dddzz", "eeezz", "words words words goodbye", "words words words hello"); + }; + assertThat( sums, equalTo( Map.of( - ".*?aaazz.*?", + keys.get(0), 1L, - ".*?bbbzz.*?", + keys.get(1), 2L, - ".*?ccczz.*?", + keys.get(2), 33L, - ".*?dddzz.*?", + keys.get(3), 44L, - ".*?eeezz.*?", + keys.get(4), 5L, - ".*?words.+?words.+?words.+?goodbye.*?", + keys.get(5), 8888L, - ".*?words.+?words.+?words.+?hello.*?", + keys.get(6), 999L ) ) @@ -569,19 +611,19 @@ public void testCategorize_withDriver() { maxs, equalTo( Map.of( - ".*?aaazz.*?", + keys.get(0), 1L, - ".*?bbbzz.*?", + keys.get(1), 2L, - ".*?ccczz.*?", + keys.get(2), 30L, - ".*?dddzz.*?", + keys.get(3), 40L, - ".*?eeezz.*?", + keys.get(4), 5L, - ".*?words.+?words.+?words.+?goodbye.*?", + keys.get(5), 8000L, - ".*?words.+?words.+?words.+?hello.*?", + keys.get(6), 900L ) ) @@ -589,10 +631,6 @@ public void testCategorize_withDriver() { Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks)); } - private BlockHash.GroupSpec makeGroupSpec() { - return new BlockHash.GroupSpec(0, ElementType.BYTES_REF, true); - } - private void assertHashState(CategorizeBlockHash hash, boolean withNull, String... expectedKeys) { // Check the keys Block[] blocks = null; diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java index 734b0660d24a3..d0eb89eafd841 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/CategorizePackedValuesBlockHashTests.java @@ -74,10 +74,15 @@ public void testCategorize_withDriver() { DriverContext driverContext = new DriverContext(bigArrays, new BlockFactory(breaker, bigArrays)); boolean withNull = randomBoolean(); boolean withMultivalues = randomBoolean(); + BlockHash.CategorizeDef categorizeDef = new BlockHash.CategorizeDef( + null, + randomFrom(BlockHash.CategorizeDef.OutputFormat.values()), + 70 + ); List groupSpecs = List.of( - new BlockHash.GroupSpec(0, ElementType.BYTES_REF, true), - new BlockHash.GroupSpec(1, ElementType.INT, false) + new BlockHash.GroupSpec(0, ElementType.BYTES_REF, categorizeDef), + new BlockHash.GroupSpec(1, ElementType.INT, null) ); LocalSourceOperator.BlockSupplier input1 = () -> { @@ -218,8 +223,12 @@ public void testCategorize_withDriver() { } Releasables.close(() -> Iterators.map(finalOutput.iterator(), (Page p) -> p::releaseBlocks)); + List keys = switch (categorizeDef.outputFormat()) { + case REGEX -> List.of(".*?connected.+?to.*?", ".*?connection.+?error.*?", ".*?disconnected.*?"); + case TOKENS -> List.of("connected to", "connection error", "disconnected"); + }; Map>> expectedResult = Map.of( - ".*?connected.+?to.*?", + keys.get(0), Map.of( 7, 
Set.of("connected to 1.1.1", "connected to 1.1.2", "connected to 1.1.4", "connected to 2.1.2"), @@ -228,9 +237,9 @@ public void testCategorize_withDriver() { 111, Set.of("connected to 2.1.1") ), - ".*?connection.+?error.*?", + keys.get(1), Map.of(7, Set.of("connection error"), 42, Set.of("connection error")), - ".*?disconnected.*?", + keys.get(2), Map.of(7, Set.of("disconnected")) ); if (withNull) { diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java index f96b9d26f075c..0ebfa7e72b805 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/aggregation/blockhash/TopNBlockHashTests.java @@ -363,7 +363,7 @@ private void hashBatchesCallbackOnLast(Consumer callback, Block[].. private BlockHash buildBlockHash(int emitBatchSize, Block... values) { List specs = new ArrayList<>(values.length); for (int c = 0; c < values.length; c++) { - specs.add(new BlockHash.GroupSpec(c, values[c].elementType(), false, topNDef(c))); + specs.add(new BlockHash.GroupSpec(c, values[c].elementType(), null, topNDef(c))); } assert forcePackedHash == false : "Packed TopN hash not implemented yet"; /*return forcePackedHash diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java index 106b9613d7bb2..0e9c0e33d22cd 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/HashAggregationOperatorTests.java @@ -113,7 +113,7 @@ public void testTopNNullsLast() { try ( var operator = new HashAggregationOperator.HashAggregationOperatorFactory( - List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, false, new BlockHash.TopNDef(0, ascOrder, false, 3))), + List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, null, new BlockHash.TopNDef(0, ascOrder, false, 3))), mode, List.of( new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(mode, aggregatorChannels), @@ -190,7 +190,7 @@ public void testTopNNullsFirst() { try ( var operator = new HashAggregationOperator.HashAggregationOperatorFactory( - List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, false, new BlockHash.TopNDef(0, ascOrder, true, 3))), + List.of(new BlockHash.GroupSpec(groupChannel, ElementType.LONG, null, new BlockHash.TopNDef(0, ascOrder, true, 3))), mode, List.of( new SumLongAggregatorFunctionSupplier().groupingAggregatorFactory(mode, aggregatorChannels), diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec index 7168ca3dc398f..be46e68a8b08a 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec @@ -397,7 +397,7 @@ FROM sample_data ; COUNT():long | SUM(event_duration):long | category:keyword - 7 | 23231327 | null + 7 | 23231327 | null ; on null row @@ -800,3 +800,82 @@ COUNT():long | VALUES(str):keyword | category:keyword | str:keyword 1 | [a, b, c] | null | b 1 | 
[a, b, c] | null | c ; + +with option output_format regex +required_capability: categorize_options + +FROM sample_data + | STATS count=COUNT() + BY category=CATEGORIZE(message, {"output_format": "regex"}) + | SORT count DESC, category +; + +count:long | category:keyword + 3 | .*?Connected.+?to.*? + 3 | .*?Connection.+?error.*? + 1 | .*?Disconnected.*? +; + +with option output_format tokens +required_capability: categorize_options + +FROM sample_data + | STATS count=COUNT() + BY category=CATEGORIZE(message, {"output_format": "tokens"}) + | SORT count DESC, category +; + +count:long | category:keyword + 3 | Connected to + 3 | Connection error + 1 | Disconnected +; + +with option similarity_threshold +required_capability: categorize_options + +FROM sample_data + | STATS count=COUNT() + BY category=CATEGORIZE(message, {"similarity_threshold": 99}) + | SORT count DESC, category +; + +count:long | category:keyword +3 | .*?Connection.+?error.*? +1 | .*?Connected.+?to.+?10\.1\.0\.1.*? +1 | .*?Connected.+?to.+?10\.1\.0\.2.*? +1 | .*?Connected.+?to.+?10\.1\.0\.3.*? +1 | .*?Disconnected.*? +; + +with option analyzer +required_capability: categorize_options + +FROM sample_data + | STATS count=COUNT() + BY category=CATEGORIZE(message, {"analyzer": "stop"}) + | SORT count DESC, category +; + +count:long | category:keyword +3 | .*?connected.*? +3 | .*?connection.+?error.*? +1 | .*?disconnected.*? +; + +with all options +required_capability: categorize_options + +FROM sample_data + | STATS count=COUNT() + BY category=CATEGORIZE(message, {"analyzer": "whitespace", "similarity_threshold": 100, "output_format": "tokens"}) + | SORT count DESC, category +; + +count:long | category:keyword +3 | Connection error +1 | Connected to 10.1.0.1 +1 | Connected to 10.1.0.2 +1 | Connected to 10.1.0.3 +1 | Disconnected +; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index f4f6bd8a12d79..41a92214406bb 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -1254,10 +1254,12 @@ public enum Cap { * FUSE command */ FUSE(Build.current().isSnapshot()), + /** * Support improved behavior for LIKE operator when used with index fields. */ LIKE_ON_INDEX_FIELDS, + /** * Support avg with aggregate metric doubles */ @@ -1274,10 +1276,15 @@ public enum Cap { */ FAIL_IF_ALL_SHARDS_FAIL(Build.current().isSnapshot()), - /* + /** * Cosine vector similarity function */ - COSINE_VECTOR_SIMILARITY_FUNCTION(Build.current().isSnapshot()); + COSINE_VECTOR_SIMILARITY_FUNCTION(Build.current().isSnapshot()), + + /** + * Support for the options field of CATEGORIZE. + */ + CATEGORIZE_OPTIONS; private final boolean enabled; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java new file mode 100644 index 0000000000000..891d8f1e6c264 --- /dev/null +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/Options.java @@ -0,0 +1,107 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.expression.function; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.xpack.esql.core.InvalidArgumentException; +import org.elasticsearch.xpack.esql.core.expression.EntryExpression; +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.Literal; +import org.elasticsearch.xpack.esql.core.expression.MapExpression; +import org.elasticsearch.xpack.esql.core.expression.TypeResolutions; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.core.type.DataTypeConverter; + +import java.util.HashMap; +import java.util.Map; +import java.util.function.Consumer; + +import static org.elasticsearch.common.logging.LoggerMessageFormat.format; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull; + +public class Options { + + public static Expression.TypeResolution resolve( + Expression options, + Source source, + TypeResolutions.ParamOrdinal paramOrdinal, + Map allowedOptions + ) { + return resolve(options, source, paramOrdinal, allowedOptions, null); + } + + public static Expression.TypeResolution resolve( + Expression options, + Source source, + TypeResolutions.ParamOrdinal paramOrdinal, + Map allowedOptions, + Consumer> verifyOptions + ) { + if (options != null) { + Expression.TypeResolution resolution = isNotNull(options, source.text(), paramOrdinal); + if (resolution.unresolved()) { + return resolution; + } + // MapExpression does not have a DataType associated with it + resolution = isMapExpression(options, source.text(), paramOrdinal); + if (resolution.unresolved()) { + return resolution; + } + try { + Map optionsMap = new HashMap<>(); + populateMap((MapExpression) options, optionsMap, source, paramOrdinal, allowedOptions); + if (verifyOptions != null) { + verifyOptions.accept(optionsMap); + } + } catch (InvalidArgumentException e) { + return new Expression.TypeResolution(e.getMessage()); + } + } + return Expression.TypeResolution.TYPE_RESOLVED; + } + + public static void populateMap( + final MapExpression options, + final Map optionsMap, + final Source source, + final TypeResolutions.ParamOrdinal paramOrdinal, + final Map allowedOptions + ) throws InvalidArgumentException { + for (EntryExpression entry : options.entryExpressions()) { + Expression optionExpr = entry.key(); + Expression valueExpr = entry.value(); + Expression.TypeResolution resolution = isFoldable(optionExpr, source.text(), paramOrdinal).and( + isFoldable(valueExpr, source.text(), paramOrdinal) + ); + if (resolution.unresolved()) { + throw new InvalidArgumentException(resolution.message()); + } + Object optionExprLiteral = ((Literal) optionExpr).value(); + Object valueExprLiteral = ((Literal) valueExpr).value(); + String optionName = optionExprLiteral instanceof BytesRef br ? br.utf8ToString() : optionExprLiteral.toString(); + String optionValue = valueExprLiteral instanceof BytesRef br ? 
br.utf8ToString() : valueExprLiteral.toString(); + // validate the optionExpr is supported + DataType dataType = allowedOptions.get(optionName); + if (dataType == null) { + throw new InvalidArgumentException( + format(null, "Invalid option [{}] in [{}], expected one of {}", optionName, source.text(), allowedOptions.keySet()) + ); + } + try { + optionsMap.put(optionName, DataTypeConverter.convert(optionValue, dataType)); + } catch (InvalidArgumentException e) { + throw new InvalidArgumentException( + format(null, "Invalid option [{}] in [{}], {}", optionName, source.text(), e.getMessage()) + ); + } + } + } +} diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/FullTextFunction.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/FullTextFunction.java index c347340c25050..b5378db783f46 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/FullTextFunction.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/FullTextFunction.java @@ -7,7 +7,6 @@ package org.elasticsearch.xpack.esql.expression.function.fulltext; -import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.compute.lucene.LuceneQueryEvaluator.ShardConfig; import org.elasticsearch.compute.lucene.LuceneQueryExpressionEvaluator; @@ -20,20 +19,15 @@ import org.elasticsearch.xpack.esql.capabilities.PostAnalysisPlanVerificationAware; import org.elasticsearch.xpack.esql.capabilities.TranslationAware; import org.elasticsearch.xpack.esql.common.Failures; -import org.elasticsearch.xpack.esql.core.InvalidArgumentException; -import org.elasticsearch.xpack.esql.core.expression.EntryExpression; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; import org.elasticsearch.xpack.esql.core.expression.FoldContext; -import org.elasticsearch.xpack.esql.core.expression.Literal; -import org.elasticsearch.xpack.esql.core.expression.MapExpression; import org.elasticsearch.xpack.esql.core.expression.Nullability; import org.elasticsearch.xpack.esql.core.expression.TypeResolutions; import org.elasticsearch.xpack.esql.core.expression.function.Function; import org.elasticsearch.xpack.esql.core.querydsl.query.Query; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; -import org.elasticsearch.xpack.esql.core.type.DataTypeConverter; import org.elasticsearch.xpack.esql.core.type.MultiTypeEsField; import org.elasticsearch.xpack.esql.evaluator.mapper.EvaluatorMapper; import org.elasticsearch.xpack.esql.expression.function.scalar.convert.AbstractConvertFunction; @@ -55,17 +49,12 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; -import java.util.Map; import java.util.Objects; import java.util.function.BiConsumer; import java.util.function.Predicate; -import static org.elasticsearch.common.logging.LoggerMessageFormat.format; import static org.elasticsearch.xpack.esql.common.Failure.fail; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.DEFAULT; -import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable; -import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression; -import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull; import 
static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNullAndFoldable; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; @@ -409,66 +398,6 @@ public ScoreOperator.ExpressionScorer.Factory toScorer(ToScorer toScorer) { return new LuceneQueryScoreEvaluator.Factory(shardConfigs); } - protected static void populateOptionsMap( - final MapExpression options, - final Map optionsMap, - final TypeResolutions.ParamOrdinal paramOrdinal, - final String sourceText, - final Map allowedOptions - ) throws InvalidArgumentException { - for (EntryExpression entry : options.entryExpressions()) { - Expression optionExpr = entry.key(); - Expression valueExpr = entry.value(); - TypeResolution resolution = isFoldable(optionExpr, sourceText, paramOrdinal).and( - isFoldable(valueExpr, sourceText, paramOrdinal) - ); - if (resolution.unresolved()) { - throw new InvalidArgumentException(resolution.message()); - } - Object optionExprLiteral = ((Literal) optionExpr).value(); - Object valueExprLiteral = ((Literal) valueExpr).value(); - String optionName = optionExprLiteral instanceof BytesRef br ? br.utf8ToString() : optionExprLiteral.toString(); - String optionValue = valueExprLiteral instanceof BytesRef br ? br.utf8ToString() : valueExprLiteral.toString(); - // validate the optionExpr is supported - DataType dataType = allowedOptions.get(optionName); - if (dataType == null) { - throw new InvalidArgumentException( - format(null, "Invalid option [{}] in [{}], expected one of {}", optionName, sourceText, allowedOptions.keySet()) - ); - } - try { - optionsMap.put(optionName, DataTypeConverter.convert(optionValue, dataType)); - } catch (InvalidArgumentException e) { - throw new InvalidArgumentException(format(null, "Invalid option [{}] in [{}], {}", optionName, sourceText, e.getMessage())); - } - } - } - - protected TypeResolution resolveOptions(Expression options, TypeResolutions.ParamOrdinal paramOrdinal) { - if (options != null) { - TypeResolution resolution = isNotNull(options, sourceText(), paramOrdinal); - if (resolution.unresolved()) { - return resolution; - } - // MapExpression does not have a DataType associated with it - resolution = isMapExpression(options, sourceText(), paramOrdinal); - if (resolution.unresolved()) { - return resolution; - } - - try { - resolvedOptions(); - } catch (InvalidArgumentException e) { - return new TypeResolution(e.getMessage()); - } - } - return TypeResolution.TYPE_RESOLVED; - } - - protected Map resolvedOptions() throws InvalidArgumentException { - return Map.of(); - } - // TODO: this should likely be replaced by calls to FieldAttribute#fieldName; the MultiTypeEsField case looks // wrong if `fieldAttribute` is a subfield, e.g. `parent.child` - multiTypeEsField#getName will just return `child`. 
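    // Note: the option-resolution helpers removed above now live in the shared
    // Options class; Match, MatchPhrase, MultiMatch, QueryString and Knn (later
    // in this patch) call Options.resolve and Options.populateMap instead.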
public static String getNameFromFieldAttribute(FieldAttribute fieldAttribute) { diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/Match.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/Match.java index 743263a878552..5c5a46fd2f759 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/Match.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/Match.java @@ -33,6 +33,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.MapParam; import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; +import org.elasticsearch.xpack.esql.expression.function.Options; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates; @@ -298,7 +299,9 @@ public final void writeTo(StreamOutput out) throws IOException { @Override protected TypeResolution resolveParams() { - return resolveField().and(resolveQuery()).and(resolveOptions(options(), THIRD)).and(checkParamCompatibility()); + return resolveField().and(resolveQuery()) + .and(Options.resolve(options(), source(), THIRD, ALLOWED_OPTIONS)) + .and(checkParamCompatibility()); } private TypeResolution resolveField() { @@ -342,11 +345,6 @@ private TypeResolution checkParamCompatibility() { return new TypeResolution(formatIncompatibleTypesMessage(fieldType, queryType, sourceText())); } - @Override - protected Map resolvedOptions() { - return matchQueryOptions(); - } - private Map matchQueryOptions() throws InvalidArgumentException { if (options() == null) { return Map.of(LENIENT_FIELD.getPreferredName(), true); @@ -356,7 +354,7 @@ private Map matchQueryOptions() throws InvalidArgumentException // Match is lenient by default to avoid failing on incompatible types matchOptions.put(LENIENT_FIELD.getPreferredName(), true); - populateOptionsMap((MapExpression) options(), matchOptions, SECOND, sourceText(), ALLOWED_OPTIONS); + Options.populateMap((MapExpression) options(), matchOptions, source(), SECOND, ALLOWED_OPTIONS); return matchOptions; } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MatchPhrase.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MatchPhrase.java index a41a9792f7943..4ed0e16ab5b4a 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MatchPhrase.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MatchPhrase.java @@ -30,6 +30,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.MapParam; import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; +import org.elasticsearch.xpack.esql.expression.function.Options; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates; @@ -187,7 +188,7 @@ public final void writeTo(StreamOutput out) throws IOException { @Override protected TypeResolution resolveParams() { - return 
resolveField().and(resolveQuery()).and(resolveOptions(options(), THIRD)); + return resolveField().and(resolveQuery()).and(Options.resolve(options(), source(), THIRD, ALLOWED_OPTIONS)); } private TypeResolution resolveField() { @@ -200,18 +201,13 @@ private TypeResolution resolveQuery() { ); } - @Override - protected Map resolvedOptions() throws InvalidArgumentException { - return matchPhraseQueryOptions(); - } - private Map matchPhraseQueryOptions() throws InvalidArgumentException { if (options() == null) { return Map.of(); } Map matchPhraseOptions = new HashMap<>(); - populateOptionsMap((MapExpression) options(), matchPhraseOptions, SECOND, sourceText(), ALLOWED_OPTIONS); + Options.populateMap((MapExpression) options(), matchPhraseOptions, source(), SECOND, ALLOWED_OPTIONS); return matchPhraseOptions; } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MultiMatch.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MultiMatch.java index 1178178c432fc..3e9fed6be850d 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MultiMatch.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/MultiMatch.java @@ -29,6 +29,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.MapParam; import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; +import org.elasticsearch.xpack.esql.expression.function.Options; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates; @@ -368,7 +369,7 @@ private Map getOptions() throws InvalidArgumentException { return options; } - Match.populateOptionsMap((MapExpression) options(), options, THIRD, sourceText(), OPTIONS); + Options.populateMap((MapExpression) options(), options, source(), THIRD, OPTIONS); return options; } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/QueryString.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/QueryString.java index a4c1b1f12fb56..7285f19fc5aa7 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/QueryString.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/fulltext/QueryString.java @@ -26,6 +26,7 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.MapParam; import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; +import org.elasticsearch.xpack.esql.expression.function.Options; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates; @@ -321,18 +322,13 @@ private Map queryStringOptions() throws InvalidArgumentException } Map matchOptions = new HashMap<>(); - populateOptionsMap((MapExpression) options(), matchOptions, SECOND, sourceText(), ALLOWED_OPTIONS); + Options.populateMap((MapExpression) options(), matchOptions, source(), SECOND, ALLOWED_OPTIONS); return matchOptions; } - @Override - protected Map resolvedOptions() { - return 
queryStringOptions(); - } - @Override protected TypeResolution resolveParams() { - return resolveQuery().and(resolveOptions(options(), SECOND)); + return resolveQuery().and(Options.resolve(options(), source(), SECOND, ALLOWED_OPTIONS)); } @Override diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java index 15b4621589457..75918091f9ecd 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Categorize.java @@ -7,13 +7,18 @@ package org.elasticsearch.xpack.esql.expression.function.grouping; +import org.elasticsearch.TransportVersions; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.compute.aggregation.blockhash.BlockHash.CategorizeDef; +import org.elasticsearch.compute.aggregation.blockhash.BlockHash.CategorizeDef.OutputFormat; import org.elasticsearch.license.XPackLicenseState; import org.elasticsearch.xpack.esql.LicenseAware; import org.elasticsearch.xpack.esql.SupportsObservabilityTier; +import org.elasticsearch.xpack.esql.core.InvalidArgumentException; import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.MapExpression; import org.elasticsearch.xpack.esql.core.expression.Nullability; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; import org.elasticsearch.xpack.esql.core.tree.Source; @@ -21,16 +26,29 @@ import org.elasticsearch.xpack.esql.expression.function.Example; import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.FunctionType; +import org.elasticsearch.xpack.esql.expression.function.MapParam; +import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; +import org.elasticsearch.xpack.esql.expression.function.Options; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import org.elasticsearch.xpack.ml.MachineLearning; import java.io.IOException; +import java.util.HashMap; import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; +import static java.util.Map.entry; +import static org.elasticsearch.common.logging.LoggerMessageFormat.format; +import static org.elasticsearch.compute.aggregation.blockhash.BlockHash.CategorizeDef.OutputFormat.REGEX; import static org.elasticsearch.xpack.esql.SupportsObservabilityTier.ObservabilityTier.COMPLETE; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.DEFAULT; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; +import static org.elasticsearch.xpack.esql.core.type.DataType.INTEGER; +import static org.elasticsearch.xpack.esql.core.type.DataType.KEYWORD; /** * Categorizes text messages. @@ -42,14 +60,23 @@ *
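 * The optional second argument tunes the categorizer: the analyzer used to
 * tokenize the field, the category key format (regex or tokens), and the
 * similarity threshold. Defaults are applied in {@code categorizeDef()} below.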

*/ @SupportsObservabilityTier(tier = COMPLETE) -public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction implements LicenseAware { +public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction implements OptionalArgument, LicenseAware { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry( Expression.class, "Categorize", Categorize::new ); + private static final String ANALYZER = "analyzer"; + private static final String OUTPUT_FORMAT = "output_format"; + private static final String SIMILARITY_THRESHOLD = "similarity_threshold"; + + private static final Map ALLOWED_OPTIONS = new TreeMap<>( + Map.ofEntries(entry(ANALYZER, KEYWORD), entry(OUTPUT_FORMAT, KEYWORD), entry(SIMILARITY_THRESHOLD, INTEGER)) + ); + private final Expression field; + private final Expression options; @FunctionInfo( returnType = "keyword", @@ -70,21 +97,56 @@ public class Categorize extends GroupingFunction.NonEvaluatableGroupingFunction ) public Categorize( Source source, - @Param(name = "field", type = { "text", "keyword" }, description = "Expression to categorize") Expression field - + @Param(name = "field", type = { "text", "keyword" }, description = "Expression to categorize") Expression field, + @MapParam( + name = "options", + description = "(Optional) Categorize additional options as <>.", + params = { + @MapParam.MapParamEntry( + name = ANALYZER, + type = "keyword", + valueHint = { "standard" }, + description = "Analyzer used to convert the field into tokens for text categorization." + ), + @MapParam.MapParamEntry( + name = OUTPUT_FORMAT, + type = "keyword", + valueHint = { "regex", "tokens" }, + description = "The output format of the categories. Defaults to regex." + ), + @MapParam.MapParamEntry( + name = SIMILARITY_THRESHOLD, + type = "integer", + valueHint = { "70" }, + description = "The minimum percentage of token weight that must match for text to be added to the category bucket. " + + "Must be between 1 and 100. The larger the value the narrower the categories. " + + "Larger values will increase memory usage and create narrower categories. Defaults to 70." + ), }, + optional = true + ) Expression options ) { - super(source, List.of(field)); + super(source, options == null ? List.of(field) : List.of(field, options)); this.field = field; + this.options = options; } private Categorize(StreamInput in) throws IOException { - this(Source.readFrom((PlanStreamInput) in), in.readNamedWriteable(Expression.class)); + this( + Source.readFrom((PlanStreamInput) in), + in.readNamedWriteable(Expression.class), + in.getTransportVersion().onOrAfter(TransportVersions.ESQL_CATEGORIZE_OPTIONS) + ? 
in.readOptionalNamedWriteable(Expression.class) + : null + ); } @Override public void writeTo(StreamOutput out) throws IOException { source().writeTo(out); out.writeNamedWriteable(field); + if (out.getTransportVersion().onOrAfter(TransportVersions.ESQL_CATEGORIZE_OPTIONS)) { + out.writeOptionalNamedWriteable(options); + } } @Override @@ -107,7 +169,48 @@ public Nullability nullable() { @Override protected TypeResolution resolveType() { - return isString(field(), sourceText(), DEFAULT); + return isString(field(), sourceText(), DEFAULT).and( + Options.resolve(options, source(), SECOND, ALLOWED_OPTIONS, this::verifyOptions) + ); + } + + private void verifyOptions(Map optionsMap) { + if (options == null) { + return; + } + Integer similarityThreshold = (Integer) optionsMap.get(SIMILARITY_THRESHOLD); + if (similarityThreshold != null) { + if (similarityThreshold <= 0 || similarityThreshold > 100) { + throw new InvalidArgumentException( + format("invalid similarity threshold [{}], expecting a number between 1 and 100, inclusive", similarityThreshold) + ); + } + } + String outputFormat = (String) optionsMap.get(OUTPUT_FORMAT); + if (outputFormat != null) { + try { + OutputFormat.valueOf(outputFormat.toUpperCase(Locale.ROOT)); + } catch (IllegalArgumentException e) { + throw new InvalidArgumentException( + format(null, "invalid output format [{}], expecting one of [REGEX, TOKENS]", outputFormat) + ); + } + } + } + + public CategorizeDef categorizeDef() { + Map optionsMap = new HashMap<>(); + if (options != null) { + Options.populateMap((MapExpression) options, optionsMap, source(), SECOND, ALLOWED_OPTIONS); + } + Integer similarityThreshold = (Integer) optionsMap.get(SIMILARITY_THRESHOLD); + String outputFormatString = (String) optionsMap.get(OUTPUT_FORMAT); + OutputFormat outputFormat = outputFormatString == null ? null : OutputFormat.valueOf(outputFormatString.toUpperCase(Locale.ROOT)); + return new CategorizeDef( + (String) optionsMap.get("analyzer"), + outputFormat == null ? REGEX : outputFormat, + similarityThreshold == null ? 70 : similarityThreshold + ); } @Override @@ -117,12 +220,12 @@ public DataType dataType() { @Override public Categorize replaceChildren(List newChildren) { - return new Categorize(source(), newChildren.get(0)); + return new Categorize(source(), newChildren.get(0), newChildren.size() > 1 ? 
@@ -117,12 +220,12 @@ public DataType dataType() {
 
     @Override
     public Categorize replaceChildren(List<Expression> newChildren) {
-        return new Categorize(source(), newChildren.get(0));
+        return new Categorize(source(), newChildren.get(0), newChildren.size() > 1 ? newChildren.get(1) : null);
     }
 
     @Override
     protected NodeInfo<? extends Expression> info() {
-        return NodeInfo.create(this, Categorize::new, field);
+        return NodeInfo.create(this, Categorize::new, field, options);
     }
 
     public Expression field() {
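
A usage sketch of the new optional map from the query side ("logs" and "message" are invented names; the option values shown are just the documented defaults, except for the output format):

    FROM logs
    | STATS COUNT(*) BY CATEGORIZE(message, { "analyzer": "standard", "output_format": "tokens", "similarity_threshold": 70 })
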
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java
index 61528521c3749..cab5ec862d7f5 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/vector/Knn.java
@@ -30,6 +30,7 @@
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.MapParam;
 import org.elasticsearch.xpack.esql.expression.function.OptionalArgument;
+import org.elasticsearch.xpack.esql.expression.function.Options;
 import org.elasticsearch.xpack.esql.expression.function.Param;
 import org.elasticsearch.xpack.esql.expression.function.fulltext.FullTextFunction;
 import org.elasticsearch.xpack.esql.expression.function.fulltext.Match;
@@ -53,10 +54,10 @@
 import static org.elasticsearch.search.vectors.KnnVectorQueryBuilder.NUM_CANDS_FIELD;
 import static org.elasticsearch.search.vectors.KnnVectorQueryBuilder.VECTOR_SIMILARITY_FIELD;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST;
+import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FOURTH;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.THIRD;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isFoldable;
-import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isMapExpression;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNullAndFoldable;
 import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isType;
@@ -198,7 +199,7 @@ public DataType dataType() {
 
     @Override
     protected TypeResolution resolveParams() {
-        return resolveField().and(resolveQuery()).and(resolveK()).and(resolveOptions());
+        return resolveField().and(resolveQuery()).and(resolveK()).and(Options.resolve(options(), source(), FOURTH, ALLOWED_OPTIONS));
     }
 
     private TypeResolution resolveField() {
@@ -221,37 +222,6 @@ private TypeResolution resolveK() {
             .and(isNotNull(k(), sourceText(), THIRD));
     }
 
-    private TypeResolution resolveOptions() {
-        if (options() != null) {
-            TypeResolution resolution = isNotNull(options(), sourceText(), TypeResolutions.ParamOrdinal.FOURTH);
-            if (resolution.unresolved()) {
-                return resolution;
-            }
-            // MapExpression does not have a DataType associated with it
-            resolution = isMapExpression(options(), sourceText(), TypeResolutions.ParamOrdinal.FOURTH);
-            if (resolution.unresolved()) {
-                return resolution;
-            }
-
-            try {
-                knnQueryOptions();
-            } catch (InvalidArgumentException e) {
-                return new TypeResolution(e.getMessage());
-            }
-        }
-        return TypeResolution.TYPE_RESOLVED;
-    }
-
-    private Map<String, Object> knnQueryOptions() throws InvalidArgumentException {
-        if (options() == null) {
-            return Map.of();
-        }
-
-        Map<String, Object> matchOptions = new HashMap<>();
-        populateOptionsMap((MapExpression) options(), matchOptions, TypeResolutions.ParamOrdinal.FOURTH, sourceText(), ALLOWED_OPTIONS);
-        return matchOptions;
-    }
-
     @Override
     public Expression replaceQueryBuilder(QueryBuilder queryBuilder) {
         return new Knn(source(), field(), query(), k(), options(), queryBuilder, filterExpressions());
@@ -307,7 +277,7 @@ public Expression withFilters(List<Expression> filterExpressions) {
     private Map<String, Object> queryOptions() throws InvalidArgumentException {
         Map<String, Object> options = new HashMap<>();
         if (options() != null) {
-            populateOptionsMap((MapExpression) options(), options, TypeResolutions.ParamOrdinal.FOURTH, sourceText(), ALLOWED_OPTIONS);
+            Options.populateMap((MapExpression) options(), options, source(), FOURTH, ALLOWED_OPTIONS);
         }
         return options;
     }
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
index dd7ee26aa84bd..8fe9ccc18c006 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/ReplaceAggregateNestedExpressionWithEval.java
@@ -10,6 +10,7 @@
 import org.elasticsearch.xpack.esql.core.expression.Alias;
 import org.elasticsearch.xpack.esql.core.expression.Attribute;
 import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.MapExpression;
 import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
 import org.elasticsearch.xpack.esql.core.util.Holder;
 import org.elasticsearch.xpack.esql.expression.function.aggregate.AggregateFunction;
@@ -137,13 +138,13 @@ private static Expression transformNonEvaluatableGroupingFunction(
 
         List<Expression> newChildren = new ArrayList<>(gf.children().size());
         for (Expression ex : gf.children()) {
-            if (ex instanceof Attribute == false) { // TODO: foldables shouldn't require eval'ing either
+            if (ex instanceof Attribute || ex instanceof MapExpression) {
+                newChildren.add(ex);
+            } else { // TODO: foldables shouldn't require eval'ing either
                 var alias = new Alias(ex.source(), syntheticName(ex, gf, counter++), ex, null, true);
                 evals.add(alias);
                 newChildren.add(alias.toAttribute());
                 childrenChanged = true;
-            } else {
-                newChildren.add(ex);
             }
         }
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java
index a5d19fcc3fb14..e45fe2b0e81d8 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AbstractPhysicalOperationProviders.java
@@ -343,8 +343,12 @@ BlockHash.GroupSpec toHashGroupSpec() {
             if (channel == null) {
                 throw new EsqlIllegalArgumentException("planned to use ordinals but tried to use the hash instead");
             }
-
-            return new BlockHash.GroupSpec(channel, elementType(), Alias.unwrap(expression) instanceof Categorize, null);
+            return new BlockHash.GroupSpec(
+                channel,
+                elementType(),
+                Alias.unwrap(expression) instanceof Categorize categorize ? categorize.categorizeDef() : null,
+                null
+            );
         }
 
         ElementType elementType() {
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java
index 4951479514b72..5d4260eb4ee66 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java
@@ -1972,6 +1972,57 @@ public void testCategorizeWithFilteredAggregations() {
         );
     }
 
+    public void testCategorizeInvalidOptionsField() {
+        assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+        assertEquals(
+            "1:31: second argument of [CATEGORIZE(last_name, first_name)] must be a map expression, received [first_name]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, first_name)")
+        );
+        assertEquals(
+            "1:31: Invalid option [blah] in [CATEGORIZE(last_name, { \"blah\": 42 })], "
+                + "expected one of [analyzer, output_format, similarity_threshold]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"blah\": 42 })")
+        );
+    }
+
+    public void testCategorizeOptionOutputFormat() {
+        assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"regex\" })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"REGEX\" })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"tokens\" })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"ToKeNs\" })");
+        assertEquals(
+            "1:31: invalid output format [blah], expecting one of [REGEX, TOKENS]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": \"blah\" })")
+        );
+        assertEquals(
+            "1:31: invalid output format [42], expecting one of [REGEX, TOKENS]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"output_format\": 42 })")
+        );
+    }
+
+    public void testCategorizeOptionSimilarityThreshold() {
+        assumeTrue("categorize options must be enabled", EsqlCapabilities.Cap.CATEGORIZE_OPTIONS.isEnabled());
+
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 1 })");
+        query("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 100 })");
+        assertEquals(
+            "1:31: invalid similarity threshold [0], expecting a number between 1 and 100, inclusive",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 0 })")
+        );
+        assertEquals(
+            "1:31: invalid similarity threshold [101], expecting a number between 1 and 100, inclusive",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": 101 })")
+        );
+        assertEquals(
+            "1:31: Invalid option [similarity_threshold] in [CATEGORIZE(last_name, { \"similarity_threshold\": \"blah\" })], "
+                + "cannot cast [blah] to [integer]",
+            error("FROM test | STATS COUNT(*) BY CATEGORIZE(last_name, { \"similarity_threshold\": \"blah\" })")
+        );
+    }
+
     public void testChangePoint() {
         assumeTrue("change_point must be enabled", EsqlCapabilities.Cap.CHANGE_POINT.isEnabled());
         var airports = AnalyzerTestUtils.analyzer(loadMapping("mapping-airports.json", "airports"));
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java
index f674f9b2c3d72..97d5b8e3ece96 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeErrorTests.java
@@ -27,7 +27,7 @@ protected List<TestCaseSupplier> cases() {
 
     @Override
     protected Expression build(Source source, List<Expression> args) {
-        return new Categorize(source, args.get(0));
+        return new Categorize(source, args.get(0), args.size() > 1 ? args.get(1) : null);
     }
 
     @Override
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java
index f69bb7eb3e7bb..296d624ee1777 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/CategorizeTests.java
@@ -61,7 +61,7 @@ public static Iterable<Object[]> parameters() {
 
     @Override
     protected Expression build(Source source, List<Expression> args) {
-        return new Categorize(source, args.get(0));
+        return new Categorize(source, args.get(0), args.size() > 1 ? args.get(1) : null);
     }
 
     @Override
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java
index 96e26fbd37a4c..ae30dce97ce5a 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/FoldNullTests.java
@@ -269,7 +269,7 @@ public void testNullBucketGetsFolded() {
     }
 
     public void testNullCategorizeGroupingNotFolded() {
-        Categorize categorize = new Categorize(EMPTY, NULL);
+        Categorize categorize = new Categorize(EMPTY, NULL, NULL);
         assertEquals(categorize, foldNull(categorize));
     }
 
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategory.java
index 5686f3734f36e..47e20d1a56bf2 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategory.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/SerializableTokenListCategory.java
@@ -162,6 +162,13 @@ public BytesRef[] getKeyTokens() {
         return Arrays.stream(keyTokenIndexes).mapToObj(index -> baseTokens[index]).toArray(BytesRef[]::new);
     }
 
+    public String getKeyTokensString() {
+        return Arrays.stream(keyTokenIndexes)
+            .mapToObj(index -> baseTokens[index])
+            .map(BytesRef::utf8ToString)
+            .collect(Collectors.joining(" "));
+    }
+
     public String getRegex() {
         if (keyTokenIndexes.length == 0 || orderedCommonTokenBeginIndex == orderedCommonTokenEndIndex) {
             return ".*";

From 8d434cc0c45a7fc7077a73a10ead6e425a6ca717 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lorenzo=20Dematt=C3=A9?=
Date: Thu, 17 Jul 2025 11:37:09 +0200
Subject: [PATCH 5/7] Migrate x-pack-autoscaling REST tests (#131365)
This PR migrates the legacy REST tests in the x-pack autoscaling module
to the internal yaml-rest-test setup.
---
 .../internal/RestrictedBuildApiService.java   |  2 --
 x-pack/plugin/autoscaling/build.gradle        | 23 +++++++++------
 x-pack/plugin/autoscaling/qa/build.gradle     |  0
 .../plugin/autoscaling/qa/rest/build.gradle   | 28 -------------------
 .../xpack/autoscaling/AutoscalingRestIT.java  | 19 +++++++++++--
 .../resources}/autoscaling-roles.yml          |  0
 .../autoscaling/delete_autoscaling_policy.yml |  0
 .../autoscaling/get_autoscaling_capacity.yml  |  0
 .../autoscaling/get_autoscaling_policy.yml    |  0
 .../autoscaling/put_autoscaling_policy.yml    |  0
 10 files changed, 32 insertions(+), 40 deletions(-)
 delete mode 100644 x-pack/plugin/autoscaling/qa/build.gradle
 delete mode 100644 x-pack/plugin/autoscaling/qa/rest/build.gradle
 rename x-pack/plugin/autoscaling/{qa/rest => }/src/yamlRestTest/java/org/elasticsearch/xpack/autoscaling/AutoscalingRestIT.java (67%)
 rename x-pack/plugin/autoscaling/{qa/rest => src/yamlRestTest/resources}/autoscaling-roles.yml (100%)
 rename x-pack/plugin/autoscaling/{qa/rest => }/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/delete_autoscaling_policy.yml (100%)
 rename x-pack/plugin/autoscaling/{qa/rest => }/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_capacity.yml (100%)
 rename x-pack/plugin/autoscaling/{qa/rest => }/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_policy.yml (100%)
 rename x-pack/plugin/autoscaling/{qa/rest => }/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/put_autoscaling_policy.yml (100%)

diff --git a/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/RestrictedBuildApiService.java b/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/RestrictedBuildApiService.java
index 4f3c4b3d94f68..205930133156c 100644
--- a/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/RestrictedBuildApiService.java
+++ b/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/RestrictedBuildApiService.java
@@ -56,8 +56,6 @@ private static ListMultimap<Class<?>, String> createLegacyRestTestBasePluginUsag
         map.put(LegacyRestTestBasePlugin.class, ":x-pack:qa:third-party:jira");
         map.put(LegacyRestTestBasePlugin.class, ":x-pack:qa:third-party:pagerduty");
         map.put(LegacyRestTestBasePlugin.class, ":x-pack:qa:third-party:slack");
-        map.put(LegacyRestTestBasePlugin.class, ":x-pack:plugin:async-search:qa:rest");
-        map.put(LegacyRestTestBasePlugin.class, ":x-pack:plugin:autoscaling:qa:rest");
         map.put(LegacyRestTestBasePlugin.class, ":x-pack:plugin:deprecation:qa:early-deprecation-rest");
         map.put(LegacyRestTestBasePlugin.class, ":x-pack:plugin:deprecation:qa:rest");
         map.put(LegacyRestTestBasePlugin.class, ":x-pack:plugin:downsample:qa:with-security");
diff --git a/x-pack/plugin/autoscaling/build.gradle b/x-pack/plugin/autoscaling/build.gradle
index 24400a0fc418e..22a43654fd602 100644
--- a/x-pack/plugin/autoscaling/build.gradle
+++ b/x-pack/plugin/autoscaling/build.gradle
@@ -1,5 +1,6 @@
 apply plugin: 'elasticsearch.internal-es-plugin'
 apply plugin: 'elasticsearch.internal-cluster-test'
+apply plugin: 'elasticsearch.internal-yaml-rest-test'
 
 esplugin {
   name = 'x-pack-autoscaling'
@@ -15,15 +16,21 @@ base {
 
 dependencies {
   compileOnly project(path: xpackModule('core'))
-  testImplementation(testArtifact(project(xpackModule('core'))))
-  testImplementation project(':modules:data-streams')
-  testImplementation project(path: xpackModule('blob-cache'))
-  testImplementation project(path: xpackModule('searchable-snapshots'))
-  testImplementation project(path: xpackModule('ilm'))
-  testImplementation project(path: xpackModule('slm'))
-  testImplementation project(path: xpackModule('ccr'))
+  testImplementation testArtifact(project(':server'))
+  testImplementation testArtifact(project(xpackModule('core')))
   testImplementation "com.fasterxml.jackson.core:jackson-databind:${versions.jackson}"
+
+  internalClusterTestImplementation project(':modules:data-streams')
+  internalClusterTestImplementation project(xpackModule('blob-cache'))
+  internalClusterTestImplementation project(xpackModule("searchable-snapshots"))
+  internalClusterTestImplementation project(xpackModule('ilm'))
+  internalClusterTestImplementation project(xpackModule('slm'))
+  internalClusterTestImplementation project(xpackModule('ccr'))
 }
 
-addQaCheckDependencies(project)
+restResources {
+  restApi {
+    include '_common', 'autoscaling'
+  }
+}
diff --git a/x-pack/plugin/autoscaling/qa/build.gradle b/x-pack/plugin/autoscaling/qa/build.gradle
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/x-pack/plugin/autoscaling/qa/rest/build.gradle b/x-pack/plugin/autoscaling/qa/rest/build.gradle
deleted file mode 100644
index 903e76fd986cf..0000000000000
--- a/x-pack/plugin/autoscaling/qa/rest/build.gradle
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-apply plugin: 'elasticsearch.legacy-yaml-rest-test'
-apply plugin: 'elasticsearch.legacy-yaml-rest-compat-test'
-
-dependencies {
-  yamlRestTestImplementation(testArtifact(project(xpackModule('core'))))
-}
-
-restResources {
-  restApi {
-    include '_common', 'autoscaling'
-  }
-}
-
-testClusters.configureEach {
-  testDistribution = 'DEFAULT'
-  setting 'xpack.security.enabled', 'true'
-  setting 'xpack.license.self_generated.type', 'trial'
-  extraConfigFile 'roles.yml', file('autoscaling-roles.yml')
-  user username: 'autoscaling-admin', password: 'autoscaling-admin-password', role: 'superuser'
-  user username: 'autoscaling-user', password: 'autoscaling-user-password', role: 'autoscaling'
-}
diff --git a/x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/java/org/elasticsearch/xpack/autoscaling/AutoscalingRestIT.java b/x-pack/plugin/autoscaling/src/yamlRestTest/java/org/elasticsearch/xpack/autoscaling/AutoscalingRestIT.java
similarity index 67%
rename from x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/java/org/elasticsearch/xpack/autoscaling/AutoscalingRestIT.java
rename to x-pack/plugin/autoscaling/src/yamlRestTest/java/org/elasticsearch/xpack/autoscaling/AutoscalingRestIT.java
index 89bc24ecc1ed0..15bef71fe1ea5 100644
--- a/x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/java/org/elasticsearch/xpack/autoscaling/AutoscalingRestIT.java
+++ b/x-pack/plugin/autoscaling/src/yamlRestTest/java/org/elasticsearch/xpack/autoscaling/AutoscalingRestIT.java
@@ -12,13 +12,24 @@
 import org.elasticsearch.common.settings.SecureString;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
+import org.elasticsearch.test.cluster.ElasticsearchCluster;
+import org.elasticsearch.test.cluster.util.resource.Resource;
 import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
 import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
-
-import static org.elasticsearch.xpack.core.security.authc.support.UsernamePasswordToken.basicAuthHeaderValue;
+import org.junit.ClassRule;
 
 public class AutoscalingRestIT extends ESClientYamlSuiteTestCase {
 
+    @ClassRule
+    public static ElasticsearchCluster cluster = ElasticsearchCluster.local()
+        .module("x-pack-autoscaling")
+        .setting("xpack.security.enabled", "true")
+        .setting("xpack.license.self_generated.type", "trial")
+        .rolesFile(Resource.fromClasspath("autoscaling-roles.yml"))
+        .user("autoscaling-admin", "autoscaling-admin-password", "superuser", false)
+        .user("autoscaling-user", "autoscaling-user-password", "autoscaling", false)
+        .build();
+
     public AutoscalingRestIT(final ClientYamlTestCandidate testCandidate) {
         super(testCandidate);
     }
@@ -40,4 +51,8 @@ protected Settings restClientSettings() {
         return Settings.builder().put(ThreadContext.PREFIX + ".Authorization", value).build();
     }
 
+    @Override
+    protected String getTestRestCluster() {
+        return cluster.getHttpAddresses();
+    }
 }
diff --git a/x-pack/plugin/autoscaling/qa/rest/autoscaling-roles.yml b/x-pack/plugin/autoscaling/src/yamlRestTest/resources/autoscaling-roles.yml
similarity index 100%
rename from x-pack/plugin/autoscaling/qa/rest/autoscaling-roles.yml
rename to x-pack/plugin/autoscaling/src/yamlRestTest/resources/autoscaling-roles.yml
diff --git a/x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/delete_autoscaling_policy.yml b/x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/delete_autoscaling_policy.yml
similarity index 100%
rename from x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/delete_autoscaling_policy.yml
rename to x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/delete_autoscaling_policy.yml
diff --git a/x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_capacity.yml b/x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_capacity.yml
similarity index 100%
rename from x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_capacity.yml
rename to x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_capacity.yml
diff --git a/x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_policy.yml b/x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_policy.yml
similarity index 100%
rename from x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_policy.yml
rename to x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/get_autoscaling_policy.yml
diff --git a/x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/put_autoscaling_policy.yml b/x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/put_autoscaling_policy.yml
similarity index 100%
rename from x-pack/plugin/autoscaling/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/put_autoscaling_policy.yml
rename to x-pack/plugin/autoscaling/src/yamlRestTest/resources/rest-api-spec/test/autoscaling/put_autoscaling_policy.yml
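
With the module migrated off the legacy plugin, the suite should run through the
standard task wired up by internal-yaml-rest-test; assuming the usual project
path conventions, something like:

    ./gradlew :x-pack:plugin:autoscaling:yamlRestTest
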
From 7dd4af3aaf9854d9c5e7cd40c41af4643bafcb9d Mon Sep 17 00:00:00 2001
From: Quentin Pradet
Date: Thu, 17 Jul 2025 15:51:45 +0400
Subject: [PATCH 6/7] Remove 'index' from snapshot clear_cache query params
 (#131067)

It's already part of the path parts, so it's not useful to duplicate it
in the query parameters.
---
 .../rest-api-spec/api/searchable_snapshots.clear_cache.json | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/rest-api-spec/src/main/resources/rest-api-spec/api/searchable_snapshots.clear_cache.json b/rest-api-spec/src/main/resources/rest-api-spec/api/searchable_snapshots.clear_cache.json
index d2d7000195c04..8b39cdcb24218 100644
--- a/rest-api-spec/src/main/resources/rest-api-spec/api/searchable_snapshots.clear_cache.json
+++ b/rest-api-spec/src/main/resources/rest-api-spec/api/searchable_snapshots.clear_cache.json
@@ -50,10 +50,6 @@
       ],
       "default": "open",
       "description": "Whether to expand wildcard expression to concrete indices that are open, closed or both."
-      },
-      "index": {
-        "type": "list",
-        "description": "A comma-separated list of index name to limit the operation"
       }
     }
   }
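
For context, the index scope of this API comes from the URL path alone, so
nothing is lost by dropping the duplicate query parameter; a request would look
roughly like this (index name invented):

    POST /my-index/_searchable_snapshots/cache/clear
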
From ea0d14b6be0b1afabd41971ae1f6416e683ac045 Mon Sep 17 00:00:00 2001
From: Ignacio Vera
Date: Thu, 17 Jul 2025 13:14:46 +0100
Subject: [PATCH 7/7] [DiskBBQ] Use PackedLongValues to hold offsets on heap
 while writing (#131411)

---
 .../vectors/DefaultIVFVectorsWriter.java      | 25 +++++++++------
 .../index/codec/vectors/IVFVectorsWriter.java | 19 +++++++-------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java
index 726e84605157a..f34ad071b6ba0 100644
--- a/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java
+++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java
@@ -17,8 +17,11 @@
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.VectorUtil;
 import org.apache.lucene.util.hnsw.IntToIntFunction;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedLongValues;
 import org.elasticsearch.index.codec.vectors.cluster.HierarchicalKMeans;
 import org.elasticsearch.index.codec.vectors.cluster.KMeansResult;
 import org.elasticsearch.logging.LogManager;
@@ -46,7 +49,7 @@ public DefaultIVFVectorsWriter(SegmentWriteState state, FlatVectorsWriter rawVec
     }
 
     @Override
-    long[] buildAndWritePostingsLists(
+    LongValues buildAndWritePostingsLists(
         FieldInfo fieldInfo,
         CentroidSupplier centroidSupplier,
         FloatVectorValues floatVectorValues,
@@ -81,7 +84,7 @@ long[] buildAndWritePostingsLists(
             }
         }
         // write the posting lists
-        final long[] offsets = new long[centroidSupplier.size()];
+        final PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
         DocIdsWriter docIdsWriter = new DocIdsWriter();
         DiskBBQBulkWriter bulkWriter = new DiskBBQBulkWriter.OneBitDiskBBQBulkWriter(ES91OSQVectorsScorer.BULK_SIZE, postingsOutput);
         OnHeapQuantizedVectors onHeapQuantizedVectors = new OnHeapQuantizedVectors(
@@ -93,7 +96,7 @@ long[] buildAndWritePostingsLists(
             float[] centroid = centroidSupplier.centroid(c);
             int[] cluster = assignmentsByCluster[c];
             // TODO align???
-            offsets[c] = postingsOutput.getFilePointer();
+            offsets.add(postingsOutput.getFilePointer());
             int size = cluster.length;
             postingsOutput.writeVInt(size);
             postingsOutput.writeInt(Float.floatToIntBits(VectorUtil.dotProduct(centroid, centroid)));
@@ -109,11 +112,11 @@ long[] buildAndWritePostingsLists(
             printClusterQualityStatistics(assignmentsByCluster);
         }
 
-        return offsets;
+        return offsets.build();
     }
 
     @Override
-    long[] buildAndWritePostingsLists(
+    LongValues buildAndWritePostingsLists(
         FieldInfo fieldInfo,
         CentroidSupplier centroidSupplier,
         FloatVectorValues floatVectorValues,
@@ -199,7 +202,7 @@ long[] buildAndWritePostingsLists(
         }
         // now we can read the quantized vectors from the temporary file
         try (IndexInput quantizedVectorsInput = mergeState.segmentInfo.dir.openInput(quantizedVectorsTempName, IOContext.DEFAULT)) {
-            final long[] offsets = new long[centroidSupplier.size()];
+            final PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
             OffHeapQuantizedVectors offHeapQuantizedVectors = new OffHeapQuantizedVectors(
                 quantizedVectorsInput,
                 fieldInfo.getVectorDimension()
@@ -210,9 +213,9 @@ long[] buildAndWritePostingsLists(
                 float[] centroid = centroidSupplier.centroid(c);
                 int[] cluster = assignmentsByCluster[c];
                 boolean[] isOverspill = isOverspillByCluster[c];
-                // TODO align???
-                offsets[c] = postingsOutput.getFilePointer();
+                offsets.add(postingsOutput.getFilePointer());
                 int size = cluster.length;
+                // TODO align???
                 postingsOutput.writeVInt(size);
                 postingsOutput.writeInt(Float.floatToIntBits(VectorUtil.dotProduct(centroid, centroid)));
                 offHeapQuantizedVectors.reset(size, ord -> isOverspill[ord], ord -> cluster[ord]);
@@ -226,7 +229,7 @@ long[] buildAndWritePostingsLists(
         if (logger.isDebugEnabled()) {
             printClusterQualityStatistics(assignmentsByCluster);
         }
-        return offsets;
+        return offsets.build();
     }
 }
@@ -270,7 +273,7 @@ void writeCentroids(
         FieldInfo fieldInfo,
         CentroidSupplier centroidSupplier,
         float[] globalCentroid,
-        long[] offsets,
+        LongValues offsets,
         IndexOutput centroidOutput
     ) throws IOException {
@@ -302,7 +305,7 @@ void writeCentroids(
             // write the centroids
             centroidOutput.writeBytes(buffer.array(), buffer.array().length);
             // write the offset of this posting list
-            centroidOutput.writeLong(offsets[i]);
+            centroidOutput.writeLong(offsets.get(i));
         }
     }
diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java
index f828c96f7a9e1..149db2eb96b83 100644
--- a/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java
+++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/IVFVectorsWriter.java
@@ -28,6 +28,7 @@
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.LongValues;
 import org.apache.lucene.util.VectorUtil;
 import org.elasticsearch.core.IOUtils;
 import org.elasticsearch.core.SuppressForbidden;
@@ -126,11 +127,11 @@ abstract void writeCentroids(
         FieldInfo fieldInfo,
         CentroidSupplier centroidSupplier,
         float[] globalCentroid,
-        long[] centroidOffset,
+        LongValues centroidOffset,
         IndexOutput centroidOutput
     ) throws IOException;
 
-    abstract long[] buildAndWritePostingsLists(
+    abstract LongValues buildAndWritePostingsLists(
         FieldInfo fieldInfo,
         CentroidSupplier centroidSupplier,
         FloatVectorValues floatVectorValues,
@@ -139,7 +140,7 @@
         int[] overspillAssignments
     ) throws IOException;
 
-    abstract long[] buildAndWritePostingsLists(
+    abstract LongValues buildAndWritePostingsLists(
         FieldInfo fieldInfo,
         CentroidSupplier centroidSupplier,
         FloatVectorValues floatVectorValues,
@@ -160,7 +161,7 @@ abstract CentroidSupplier createCentroidSupplier(
     public final void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
         rawVectorDelegate.flush(maxDoc, sortMap);
         for (FieldWriter fieldWriter : fieldWriters) {
-            float[] globalCentroid = new float[fieldWriter.fieldInfo.getVectorDimension()];
+            final float[] globalCentroid = new float[fieldWriter.fieldInfo.getVectorDimension()];
             // build a float vector values with random access
            final FloatVectorValues floatVectorValues = getFloatVectorValues(fieldWriter.fieldInfo, fieldWriter.delegate, maxDoc);
             // build centroids
@@ -168,7 +169,7 @@ public final void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
             // wrap centroids with a supplier
             final CentroidSupplier centroidSupplier = new OnHeapCentroidSupplier(centroidAssignments.centroids());
             // write posting lists
-            final long[] offsets = buildAndWritePostingsLists(
+            final LongValues offsets = buildAndWritePostingsLists(
                 fieldWriter.fieldInfo,
                 centroidSupplier,
                 floatVectorValues,
@@ -176,9 +177,8 @@ public final void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
                 centroidAssignments.assignments(),
                 centroidAssignments.overspillAssignments()
             );
-            assert offsets.length == centroidSupplier.size();
-            final long centroidOffset = ivfCentroids.alignFilePointer(Float.BYTES);
             // write centroids
+            final long centroidOffset = ivfCentroids.alignFilePointer(Float.BYTES);
             writeCentroids(fieldWriter.fieldInfo, centroidSupplier, globalCentroid, offsets, ivfCentroids);
             final long centroidLength = ivfCentroids.getFilePointer() - centroidOffset;
             // write meta file
@@ -338,7 +338,7 @@ private void mergeOneFieldIVF(FieldInfo fieldInfo, MergeState mergeState) throws
                 calculatedGlobalCentroid
             );
             // write posting lists
-            final long[] offsets = buildAndWritePostingsLists(
+            final LongValues offsets = buildAndWritePostingsLists(
                 fieldInfo,
                 centroidSupplier,
                 floatVectorValues,
@@ -347,9 +347,8 @@ private void mergeOneFieldIVF(FieldInfo fieldInfo, MergeState mergeState) throws
                 assignments,
                 overspillAssignments
             );
-            assert offsets.length == centroidSupplier.size();
-            centroidOffset = ivfCentroids.alignFilePointer(Float.BYTES);
             // write centroids
+            centroidOffset = ivfCentroids.alignFilePointer(Float.BYTES);
             writeCentroids(fieldInfo, centroidSupplier, calculatedGlobalCentroid, offsets, ivfCentroids);
             centroidLength = ivfCentroids.getFilePointer() - centroidOffset;
             // write meta
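
The PackedLongValues change above is easiest to see in isolation. Below is a
minimal, self-contained sketch of the pattern (illustrative only — the class
name and byte counts are invented; the Lucene types and calls are the same ones
the patch imports):

    import org.apache.lucene.util.LongValues;
    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PackedLongValues;

    public class OffsetsSketch {
        public static void main(String[] args) {
            // Posting lists are written sequentially, so their start offsets form a
            // monotonically increasing sequence; the monotonic builder delta-encodes
            // it, which is much cheaper on heap than one long per centroid.
            PackedLongValues.Builder offsets = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
            long filePointer = 0;
            for (int cluster = 0; cluster < 4; cluster++) {
                offsets.add(filePointer);           // record where this cluster's posting list starts
                filePointer += 128 + cluster * 16L; // pretend a posting list of this size was written
            }
            // build() freezes the sequence into a read-only structure; LongValues is
            // the supertype the writer now hands to writeCentroids.
            LongValues frozen = offsets.build();
            for (int cluster = 0; cluster < 4; cluster++) {
                System.out.println("cluster " + cluster + " starts at " + frozen.get(cluster));
            }
        }
    }

The trade-off, visible in both writers above, is that the builder is
append-only: offsets must be added in cluster order inside the write loop
rather than assigned by index.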