ESQL: Support union types/multi-index CSV tests (#119273)

GalLalouche · web-flow · commit 9452a8582bcb · 2025-01-03T17:10:42.000Z
ESQL: Adds support for multi-index queries and union types in CSV tests. Previously, these could only be tested through integration tests, which are slower and harder to debug.
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java
@@ -551,11 +551,11 @@ public static Type asType(ElementType elementType, Type actualType) {
         }
 
         private static Type bytesRefBlockType(Type actualType) {
-            if (actualType == GEO_POINT || actualType == CARTESIAN_POINT || actualType == GEO_SHAPE || actualType == CARTESIAN_SHAPE) {
-                return actualType;
-            } else {
-                return KEYWORD;
-            }
+            return switch (actualType) {
+                case NULL -> NULL;
+                case GEO_POINT, CARTESIAN_POINT, GEO_SHAPE, CARTESIAN_SHAPE -> actualType;
+                default -> KEYWORD;
+            };
         }
 
         Object convert(String value) {
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java b/x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java
@@ -26,6 +26,7 @@
 import org.elasticsearch.common.logging.LogConfigurator;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentHelper;
+import org.elasticsearch.core.Nullable;
 import org.elasticsearch.logging.LogManager;
 import org.elasticsearch.logging.Logger;
 import org.elasticsearch.test.rest.ESRestTestCase;
@@ -603,13 +604,20 @@ private static void forceMerge(RestClient client, Set<String> indices, Logger lo
         }
     }
 
+    public record MultiIndexTestDataset(String indexPattern, List<TestsDataset> datasets) {
+        public static MultiIndexTestDataset of(TestsDataset testsDataset) {
+            return new MultiIndexTestDataset(testsDataset.indexName, List.of(testsDataset));
+        }
+
+    }
+
     public record TestsDataset(
         String indexName,
         String mappingFileName,
         String dataFileName,
         String settingFileName,
         boolean allowSubFields,
-        Map<String, String> typeMapping,
+        @Nullable Map<String, String> typeMapping,
         boolean requiresInferenceEndpoint
     ) {
         public TestsDataset(String indexName, String mappingFileName, String dataFileName) {
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/union_types.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/union_types.csv-spec
@@ -445,6 +445,7 @@ count:long  |  message:keyword
 
 multiIndexMissingIpToString
 required_capability: union_types
+required_capability: metadata_fields
 required_capability: union_types_missing_field
 
 FROM sample_data, sample_data_str, missing_ip_sample_data METADATA _index
@@ -479,6 +480,7 @@ sample_data_str        | 2023-10-23T12:15:03.360Z  |  172.21.2.162       |  3450
 
 multiIndexMissingIpToIp
 required_capability: union_types
+required_capability: metadata_fields
 required_capability: union_types_missing_field
 
 FROM sample_data, sample_data_str, missing_ip_sample_data METADATA _index
@@ -1373,9 +1375,6 @@ client_ip:ip | event_duration:long |    message:keyword    |    @timestamp:keywo
 # Once INLINESTATS supports expressions in agg functions and groups, convert the group in the inlinestats
 
 multiIndexIndirectUseOfUnionTypesInSort
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | SORT client_ip ASC
 | LIMIT 1
@@ -1386,8 +1385,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInEval
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
 required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | EVAL foo = event_duration > 1232381
@@ -1400,9 +1397,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInRename
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: union_types_fix_rename_resolution
 FROM sample_data, sample_data_ts_long
 | RENAME message AS event_message
@@ -1415,9 +1409,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInKeep
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | KEEP client_ip, event_duration, message
 | SORT client_ip ASC
@@ -1429,9 +1420,6 @@ client_ip:ip | event_duration:long | message:keyword
 ;
 
 multiIndexIndirectUseOfUnionTypesInWildcardKeep
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: union_types_fix_rename_resolution
 FROM sample_data, sample_data_ts_long
 | KEEP *
@@ -1444,9 +1432,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInWildcardKeep2
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: union_types_fix_rename_resolution
 FROM sample_data, sample_data_ts_long
 | KEEP *e*
@@ -1460,9 +1445,6 @@ FROM sample_data, sample_data_ts_long
 
 
 multiIndexUseOfUnionTypesInKeep
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: union_types_fix_rename_resolution
 FROM sample_data, sample_data_ts_long
 | KEEP @timestamp
@@ -1474,9 +1456,6 @@ null
 ;
 
 multiIndexUseOfUnionTypesInDrop
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: union_types_fix_rename_resolution
 FROM sample_data, sample_data_ts_long
 | DROP @timestamp
@@ -1489,9 +1468,6 @@ client_ip:ip | event_duration:long | message:keyword
 ;
 
 multiIndexIndirectUseOfUnionTypesInWildcardDrop
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: union_types_fix_rename_resolution
 FROM sample_data, sample_data_ts_long
 | DROP *time*
@@ -1504,9 +1480,6 @@ client_ip:ip | event_duration:long | message:keyword
 ;
 
 multiIndexIndirectUseOfUnionTypesInWhere
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | WHERE message == "Disconnected"
 ;
@@ -1517,9 +1490,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInDissect
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | DISSECT message "%{foo}"
 | SORT client_ip ASC
@@ -1531,9 +1501,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInGrok
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | GROK message "%{WORD:foo}"
 | SORT client_ip ASC
@@ -1545,9 +1512,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInEnrich
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: enrich_load
 FROM sample_data, sample_data_ts_long
 | EVAL client_ip = client_ip::keyword
@@ -1561,9 +1525,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInStats
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | STATS foo = max(event_duration) BY client_ip
 | SORT client_ip ASC
@@ -1577,9 +1538,6 @@ foo:long | client_ip:ip
 ;
 
 multiIndexIndirectUseOfUnionTypesInInlineStats-Ignore
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: inlinestats
 FROM sample_data, sample_data_ts_long
 | INLINESTATS foo = max(event_duration)
@@ -1592,9 +1550,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInLookup-Ignore
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 required_capability: lookup_v4
 FROM sample_data, sample_data_ts_long
 | SORT client_ip ASC
@@ -1608,9 +1563,6 @@ FROM sample_data, sample_data_ts_long
 ;
 
 multiIndexIndirectUseOfUnionTypesInMvExpand
-// TODO: `union_types` is required only because this makes the test skip in the csv tests; better solution:
-// make the csv tests work with multiple indices.
-required_capability: union_types
 FROM sample_data, sample_data_ts_long
 | EVAL foo = MV_APPEND(message, message)
 | SORT client_ip ASC
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/EsPhysicalOperationProviders.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/EsPhysicalOperationProviders.java
@@ -13,22 +13,16 @@
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
-import org.elasticsearch.common.breaker.CircuitBreaker;
-import org.elasticsearch.common.breaker.NoopCircuitBreaker;
 import org.elasticsearch.common.logging.HeaderWarning;
-import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.compute.aggregation.GroupingAggregator;
 import org.elasticsearch.compute.data.Block;
 import org.elasticsearch.compute.data.ElementType;
-import org.elasticsearch.compute.data.Page;
 import org.elasticsearch.compute.lucene.LuceneCountOperator;
 import org.elasticsearch.compute.lucene.LuceneOperator;
 import org.elasticsearch.compute.lucene.LuceneSourceOperator;
 import org.elasticsearch.compute.lucene.LuceneTopNSourceOperator;
 import org.elasticsearch.compute.lucene.TimeSeriesSortedSourceOperatorFactory;
 import org.elasticsearch.compute.lucene.ValuesSourceReaderOperator;
-import org.elasticsearch.compute.operator.DriverContext;
-import org.elasticsearch.compute.operator.EvalOperator;
 import org.elasticsearch.compute.operator.Operator;
 import org.elasticsearch.compute.operator.OrdinalsGroupingOperator;
 import org.elasticsearch.compute.operator.SourceOperator;
@@ -380,29 +374,13 @@ public FieldNamesFieldMapper.FieldNamesFieldType fieldNames() {
         }
     }
 
-    static class TypeConvertingBlockLoader implements BlockLoader {
-        protected final BlockLoader delegate;
-        private final EvalOperator.ExpressionEvaluator convertEvaluator;
+    private static class TypeConvertingBlockLoader implements BlockLoader {
+        private final BlockLoader delegate;
+        private final TypeConverter typeConverter;
 
         protected TypeConvertingBlockLoader(BlockLoader delegate, AbstractConvertFunction convertFunction) {
             this.delegate = delegate;
-            DriverContext driverContext1 = new DriverContext(
-                BigArrays.NON_RECYCLING_INSTANCE,
-                new org.elasticsearch.compute.data.BlockFactory(
-                    new NoopCircuitBreaker(CircuitBreaker.REQUEST),
-                    BigArrays.NON_RECYCLING_INSTANCE
-                )
-            );
-            this.convertEvaluator = convertFunction.toEvaluator(e -> driverContext -> new EvalOperator.ExpressionEvaluator() {
-                @Override
-                public org.elasticsearch.compute.data.Block eval(Page page) {
-                    // This is a pass-through evaluator, since it sits directly on the source loading (no prior expressions)
-                    return page.getBlock(0);
-                }
-
-                @Override
-                public void close() {}
-            }).get(driverContext1);
+            this.typeConverter = TypeConverter.fromConvertFunction(convertFunction);
         }
 
         @Override
@@ -413,8 +391,7 @@ public Builder builder(BlockFactory factory, int expectedCount) {
 
         @Override
         public Block convert(Block block) {
-            Page page = new Page((org.elasticsearch.compute.data.Block) block);
-            return convertEvaluator.eval(page);
+            return typeConverter.convert((org.elasticsearch.compute.data.Block) block);
         }
 
         @Override
@@ -427,9 +404,7 @@ public ColumnAtATimeReader columnAtATimeReader(LeafReaderContext context) throws
                 @Override
                 public Block read(BlockFactory factory, Docs docs) throws IOException {
                     Block block = reader.read(factory, docs);
-                    Page page = new Page((org.elasticsearch.compute.data.Block) block);
-                    org.elasticsearch.compute.data.Block converted = convertEvaluator.eval(page);
-                    return converted;
+                    return typeConverter.convert((org.elasticsearch.compute.data.Block) block);
                 }
 
                 @Override
@@ -469,7 +444,7 @@ public SortedSetDocValues ordinals(LeafReaderContext context) {
 
         @Override
         public final String toString() {
-            return "TypeConvertingBlockLoader[delegate=" + delegate + ", convertEvaluator=" + convertEvaluator + "]";
+            return "TypeConvertingBlockLoader[delegate=" + delegate + ", typeConverter=" + typeConverter + "]";
         }
     }
 }
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/TypeConverter.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/TypeConverter.java
@@ -0,0 +1,59 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.esql.planner;
+
+import org.elasticsearch.common.breaker.CircuitBreaker;
+import org.elasticsearch.common.breaker.NoopCircuitBreaker;
+import org.elasticsearch.common.util.BigArrays;
+import org.elasticsearch.compute.data.Block;
+import org.elasticsearch.compute.data.Page;
+import org.elasticsearch.compute.operator.DriverContext;
+import org.elasticsearch.compute.operator.EvalOperator.ExpressionEvaluator;
+import org.elasticsearch.xpack.esql.expression.function.scalar.convert.AbstractConvertFunction;
+
+class TypeConverter {
+    private final String evaluatorName;
+    private final ExpressionEvaluator convertEvaluator;
+
+    private TypeConverter(String evaluatorName, ExpressionEvaluator convertEvaluator) {
+        this.evaluatorName = evaluatorName;
+        this.convertEvaluator = convertEvaluator;
+    }
+
+    public static TypeConverter fromConvertFunction(AbstractConvertFunction convertFunction) {
+        DriverContext driverContext1 = new DriverContext(
+            BigArrays.NON_RECYCLING_INSTANCE,
+            new org.elasticsearch.compute.data.BlockFactory(
+                new NoopCircuitBreaker(CircuitBreaker.REQUEST),
+                BigArrays.NON_RECYCLING_INSTANCE
+            )
+        );
+        return new TypeConverter(
+            convertFunction.functionName(),
+            convertFunction.toEvaluator(e -> driverContext -> new ExpressionEvaluator() {
+                @Override
+                public org.elasticsearch.compute.data.Block eval(Page page) {
+                    // This is a pass-through evaluator, since it sits directly on the source loading (no prior expressions)
+                    return page.getBlock(0);
+                }
+
+                @Override
+                public void close() {}
+            }).get(driverContext1)
+        );
+    }
+
+    public Block convert(Block block) {
+        return convertEvaluator.eval(new Page(block));
+    }
+
+    @Override
+    public String toString() {
+        return evaluatorName;
+    }
+}
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/planner/TestPhysicalOperationProviders.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/planner/TestPhysicalOperationProviders.java