Skip to content

Commit 1402e68

Browse files
authored
Add support for aggregations, GROK and DISSECT for semantic_text (#117337)
* Add support for aggregations for semantic_text
* Add capability to csv tests for grok and dissect
* Sort values to avoid flaky tests
1 parent 631345f commit 1402e68

File tree

16 files changed

+131
-24
lines changed

16 files changed

+131
-24
lines changed

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/EsqlSpecTestCase.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,8 @@ public final void test() throws Throwable {
172172
}
173173

174174
protected void shouldSkipTest(String testName) throws IOException {
175-
if (testCase.requiredCapabilities.contains("semantic_text_type")) {
175+
if (testCase.requiredCapabilities.contains("semantic_text_type")
176+
|| testCase.requiredCapabilities.contains("semantic_text_aggregations")) {
176177
assumeTrue("Inference test service needs to be supported for semantic_text", supportsInferenceTestService());
177178
}
178179
checkCapabilities(adminClient(), testFeatureService, testName, testCase);

x-pack/plugin/esql/qa/testFixtures/src/main/resources/mapping-semantic_text.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@
7272
"st_base64": {
7373
"type": "semantic_text",
7474
"inference_id": "test_sparse_inference"
75+
},
76+
"st_logs": {
77+
"type": "semantic_text",
78+
"inference_id": "test_sparse_inference"
7579
}
7680
}
7781
}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
_id:keyword,semantic_text_field:semantic_text,st_bool:semantic_text,st_cartesian_point:semantic_text,st_cartesian_shape:semantic_text,st_datetime:semantic_text,st_double:semantic_text,st_geopoint:semantic_text,st_geoshape:semantic_text,st_integer:semantic_text,st_ip:semantic_text,st_long:semantic_text,st_unsigned_long:semantic_text,st_version:semantic_text,st_multi_value:semantic_text,st_unicode:semantic_text,host:keyword,description:text,value:long,st_base64:semantic_text
2-
1,live long and prosper,false,"POINT(4297.11 -1475.53)",,1953-09-02T00:00:00.000Z,5.20128E11,"POINT(42.97109630194 14.7552534413725)","POLYGON ((30 10\, 40 40\, 20 40\, 10 20\, 30 10))",23,1.1.1.1,2147483648,2147483648,1.2.3,["Hello there!", "This is a random value", "for testing purposes"],你吃饭了吗,"host1","some description1",1001,ZWxhc3RpYw==
3-
2,all we have to decide is what to do with the time that is given to us,true,"POINT(7580.93 2272.77)",,2023-09-24T15:57:00.000Z,4541.11,"POINT(37.97109630194 21.7552534413725)","POLYGON ((30 10\, 40 40\, 20 40\, 10 20\, 30 10))",122,1.1.2.1,123,2147483648.2,9.0.0,["nice to meet you", "bye bye!"],["谢谢", "对不起我的中文不好"],"host2","some description2",1002,aGVsbG8=
4-
3,be excellent to each other,,,,,,,,,,,,,,,"host3","some description3",1003,
1+
_id:keyword,semantic_text_field:semantic_text,st_bool:semantic_text,st_cartesian_point:semantic_text,st_cartesian_shape:semantic_text,st_datetime:semantic_text,st_double:semantic_text,st_geopoint:semantic_text,st_geoshape:semantic_text,st_integer:semantic_text,st_ip:semantic_text,st_long:semantic_text,st_unsigned_long:semantic_text,st_version:semantic_text,st_multi_value:semantic_text,st_unicode:semantic_text,host:keyword,description:text,value:long,st_base64:semantic_text,st_logs:semantic_text
2+
1,live long and prosper,false,"POINT(4297.11 -1475.53)",,1953-09-02T00:00:00.000Z,5.20128E11,"POINT(42.97109630194 14.7552534413725)","POLYGON ((30 10\, 40 40\, 20 40\, 10 20\, 30 10))",23,1.1.1.1,2147483648,2147483648,1.2.3,["Hello there!", "This is a random value", "for testing purposes"],你吃饭了吗,"host1","some description1",1001,ZWxhc3RpYw==,"2024-12-23T12:15:00.000Z 1.2.3.4 [email protected] 4553"
3+
2,all we have to decide is what to do with the time that is given to us,true,"POINT(7580.93 2272.77)",,2023-09-24T15:57:00.000Z,4541.11,"POINT(37.97109630194 21.7552534413725)","POLYGON ((30 10\, 40 40\, 20 40\, 10 20\, 30 10))",122,1.1.2.1,123,2147483648.2,9.0.0,["nice to meet you", "bye bye!"],["谢谢", "对不起我的中文不好"],"host2","some description2",1002,aGVsbG8=,"2024-01-23T12:15:00.000Z 1.2.3.4 [email protected] 42"
4+
3,be excellent to each other,,,,,,,,,,,,,,,"host3","some description3",1003,,"2023-01-23T12:15:00.000Z 127.0.0.1 [email protected] 42"

x-pack/plugin/esql/qa/testFixtures/src/main/resources/semantic_text.csv-spec

Lines changed: 92 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,75 @@ _id:keyword | my_field:semantic_text
8888
3 | be excellent to each other
8989
;
9090

91-
simpleStats
92-
required_capability: semantic_text_type
91+
statsWithCount
92+
required_capability: semantic_text_aggregations
93+
94+
FROM semantic_text METADATA _id
95+
| STATS result = COUNT(st_version)
96+
;
97+
98+
result:long
99+
2
100+
;
101+
102+
statsWithCountDistinct
103+
required_capability: semantic_text_aggregations
104+
105+
FROM semantic_text METADATA _id
106+
| STATS result = COUNT_DISTINCT(st_version)
107+
;
108+
109+
result:long
110+
2
111+
;
112+
113+
statsWithValues
114+
required_capability: semantic_text_aggregations
115+
116+
FROM semantic_text METADATA _id
117+
| STATS result = VALUES(st_version)
118+
| EVAL result = MV_SORT(result)
119+
;
120+
121+
result:keyword
122+
["1.2.3", "9.0.0"]
123+
;
124+
125+
statsWithMin
126+
required_capability: semantic_text_aggregations
127+
128+
FROM semantic_text METADATA _id
129+
| STATS result = min(st_version)
130+
;
131+
132+
result:keyword
133+
1.2.3
134+
;
135+
136+
statsWithMax
137+
required_capability: semantic_text_aggregations
93138

94139
FROM semantic_text METADATA _id
95-
| STATS COUNT(*)
140+
| STATS result = max(st_version)
96141
;
97142

98-
COUNT(*):long
99-
3
143+
result:keyword
144+
9.0.0
145+
;
146+
147+
statsWithTop
148+
required_capability: semantic_text_aggregations
149+
150+
FROM semantic_text METADATA _id
151+
| STATS result = top(st_version, 2, "asc")
152+
;
153+
154+
result:keyword
155+
["1.2.3", "9.0.0"]
100156
;
101157

102158
statsWithGrouping
103-
required_capability: semantic_text_type
159+
required_capability: semantic_text_aggregations
104160

105161
FROM semantic_text METADATA _id
106162
| STATS COUNT(*) BY st_version
@@ -132,6 +188,36 @@ COUNT(*):long | my_field:semantic_text
132188
1 | bye bye!
133189
;
134190

191+
grok
192+
required_capability: semantic_text_type
193+
194+
FROM semantic_text METADATA _id
195+
| GROK st_logs """%{TIMESTAMP_ISO8601:date} %{IP:ip} %{EMAILADDRESS:email} %{NUMBER:num}"""
196+
| KEEP st_logs, date, ip, email, num
197+
| SORT st_logs
198+
;
199+
200+
st_logs:semantic_text | date:keyword | ip:keyword | email:keyword | num:keyword
201+
2023-01-23T12:15:00.000Z 127.0.0.1 [email protected] 42 | 2023-01-23T12:15:00.000Z | 127.0.0.1 | [email protected] | 42
202+
2024-01-23T12:15:00.000Z 1.2.3.4 [email protected] 42 | 2024-01-23T12:15:00.000Z | 1.2.3.4 | [email protected] | 42
203+
2024-12-23T12:15:00.000Z 1.2.3.4 [email protected] 4553 | 2024-12-23T12:15:00.000Z | 1.2.3.4 | [email protected] | 4553
204+
;
205+
206+
dissect
207+
required_capability: semantic_text_type
208+
209+
FROM semantic_text METADATA _id
210+
| DISSECT st_logs """%{date} %{ip} %{email} %{num}"""
211+
| KEEP st_logs, date, ip, email, num
212+
| SORT st_logs
213+
;
214+
215+
st_logs:semantic_text | date:keyword | ip:keyword | email:keyword | num:keyword
216+
2023-01-23T12:15:00.000Z 127.0.0.1 [email protected] 42 | 2023-01-23T12:15:00.000Z | 127.0.0.1 | [email protected] | 42
217+
2024-01-23T12:15:00.000Z 1.2.3.4 [email protected] 42 | 2024-01-23T12:15:00.000Z | 1.2.3.4 | [email protected] | 42
218+
2024-12-23T12:15:00.000Z 1.2.3.4 [email protected] 4553 | 2024-12-23T12:15:00.000Z | 1.2.3.4 | [email protected] | 4553
219+
;
220+
135221
simpleWithLongValue
136222
required_capability: semantic_text_type
137223

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,12 @@ public enum Cap {
526526
/**
527527
* Fix for https://github.com/elastic/elasticsearch/issues/117054
528528
*/
529-
FIX_NESTED_FIELDS_NAME_CLASH_IN_INDEXRESOLVER;
529+
FIX_NESTED_FIELDS_NAME_CLASH_IN_INDEXRESOLVER,
530+
531+
/**
532+
* Support for aggregations on semantic_text.
533+
*/
534+
SEMANTIC_TEXT_AGGREGATIONS(EsqlCorePlugin.SEMANTIC_TEXT_FEATURE_FLAG);
530535

531536
private final boolean enabled;
532537

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/CountDistinct.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ public class CountDistinct extends AggregateFunction implements OptionalArgument
6666
Map.entry(DataType.KEYWORD, CountDistinctBytesRefAggregatorFunctionSupplier::new),
6767
Map.entry(DataType.IP, CountDistinctBytesRefAggregatorFunctionSupplier::new),
6868
Map.entry(DataType.VERSION, CountDistinctBytesRefAggregatorFunctionSupplier::new),
69-
Map.entry(DataType.TEXT, CountDistinctBytesRefAggregatorFunctionSupplier::new)
69+
Map.entry(DataType.TEXT, CountDistinctBytesRefAggregatorFunctionSupplier::new),
70+
Map.entry(DataType.SEMANTIC_TEXT, CountDistinctBytesRefAggregatorFunctionSupplier::new)
7071
);
7172

7273
private static final int DEFAULT_PRECISION = 3000;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Max.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ public class Max extends AggregateFunction implements ToAggregator, SurrogateExp
5151
Map.entry(DataType.IP, MaxIpAggregatorFunctionSupplier::new),
5252
Map.entry(DataType.KEYWORD, MaxBytesRefAggregatorFunctionSupplier::new),
5353
Map.entry(DataType.TEXT, MaxBytesRefAggregatorFunctionSupplier::new),
54+
Map.entry(DataType.SEMANTIC_TEXT, MaxBytesRefAggregatorFunctionSupplier::new),
5455
Map.entry(DataType.VERSION, MaxBytesRefAggregatorFunctionSupplier::new)
5556
);
5657

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Min.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ public class Min extends AggregateFunction implements ToAggregator, SurrogateExp
5151
Map.entry(DataType.IP, MinIpAggregatorFunctionSupplier::new),
5252
Map.entry(DataType.VERSION, MinBytesRefAggregatorFunctionSupplier::new),
5353
Map.entry(DataType.KEYWORD, MinBytesRefAggregatorFunctionSupplier::new),
54-
Map.entry(DataType.TEXT, MinBytesRefAggregatorFunctionSupplier::new)
54+
Map.entry(DataType.TEXT, MinBytesRefAggregatorFunctionSupplier::new),
55+
Map.entry(DataType.SEMANTIC_TEXT, MinBytesRefAggregatorFunctionSupplier::new)
5556
);
5657

5758
@FunctionInfo(

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Values.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ public class Values extends AggregateFunction implements ToAggregator {
4646
Map.entry(DataType.DOUBLE, ValuesDoubleAggregatorFunctionSupplier::new),
4747
Map.entry(DataType.KEYWORD, ValuesBytesRefAggregatorFunctionSupplier::new),
4848
Map.entry(DataType.TEXT, ValuesBytesRefAggregatorFunctionSupplier::new),
49+
Map.entry(DataType.SEMANTIC_TEXT, ValuesBytesRefAggregatorFunctionSupplier::new),
4950
Map.entry(DataType.IP, ValuesBytesRefAggregatorFunctionSupplier::new),
5051
Map.entry(DataType.VERSION, ValuesBytesRefAggregatorFunctionSupplier::new),
5152
Map.entry(DataType.BOOLEAN, ValuesBooleanAggregatorFunctionSupplier::new)

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/planner/AggregateMapper.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -302,12 +302,13 @@ private static String dataTypeToString(DataType type, Class<?> aggClass) {
302302
case DataType.INTEGER, DataType.COUNTER_INTEGER -> "Int";
303303
case DataType.LONG, DataType.DATETIME, DataType.COUNTER_LONG, DataType.DATE_NANOS -> "Long";
304304
case DataType.DOUBLE, DataType.COUNTER_DOUBLE -> "Double";
305-
case DataType.KEYWORD, DataType.IP, DataType.VERSION, DataType.TEXT -> "BytesRef";
305+
case DataType.KEYWORD, DataType.IP, DataType.VERSION, DataType.TEXT, DataType.SEMANTIC_TEXT -> "BytesRef";
306306
case GEO_POINT -> "GeoPoint";
307307
case CARTESIAN_POINT -> "CartesianPoint";
308-
case SEMANTIC_TEXT, UNSUPPORTED, NULL, UNSIGNED_LONG, SHORT, BYTE, FLOAT, HALF_FLOAT, SCALED_FLOAT, OBJECT, SOURCE, DATE_PERIOD,
309-
TIME_DURATION, CARTESIAN_SHAPE, GEO_SHAPE, DOC_DATA_TYPE, TSID_DATA_TYPE, PARTIAL_AGG ->
310-
throw new EsqlIllegalArgumentException("illegal agg type: " + type.typeName());
308+
case UNSUPPORTED, NULL, UNSIGNED_LONG, SHORT, BYTE, FLOAT, HALF_FLOAT, SCALED_FLOAT, OBJECT, SOURCE, DATE_PERIOD, TIME_DURATION,
309+
CARTESIAN_SHAPE, GEO_SHAPE, DOC_DATA_TYPE, TSID_DATA_TYPE, PARTIAL_AGG -> throw new EsqlIllegalArgumentException(
310+
"illegal agg type: " + type.typeName()
311+
);
311312
};
312313
}
313314
}

0 commit comments

Comments
 (0)