diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
index 0337000a201f2..04c8bb0aa037c 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java
@@ -210,6 +210,22 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L
.build();
}
+    /**
+     * Builds the categorization_analyzer that backs the ES|QL categorize function. It differs from the
+     * DSL analyzer only in its tokenizer: standard rather than ml_standard. Results therefore differ
+     * slightly from the categorize text aggregation and the ML job, but the tokens can be used to look up
+     * messages in indices generated with the standard tokenizer, which is considered more important.
+     */
+    public static CategorizationAnalyzerConfig buildStandardEsqlCategorizationAnalyzer() {
+        final CategorizationAnalyzerConfig esqlAnalyzerConfig = new CategorizationAnalyzerConfig.Builder()
+            .addCharFilter("first_line_with_letters")
+            .setTokenizer("standard")
+            .addDateWordsTokenFilter()
+            .addLimitFilter()
+            .build();
+        return esqlAnalyzerConfig;
+    }
+
private final String analyzer;
private final List charFilters;
private final NameOrDefinition tokenizer;
diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java
index f83776fbdbc85..5e716d8c9d5ff 100644
--- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java
+++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java
@@ -39,7 +39,6 @@
import java.io.IOException;
import java.util.HashMap;
-import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -48,9 +47,8 @@
*/
public class CategorizeBlockHash extends BlockHash {
- private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig.buildStandardCategorizationAnalyzer(
- List.of()
- );
+ private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig
+ .buildStandardEsqlCategorizationAnalyzer();
private static final int NULL_ORD = 0;
private final int channel;
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec
index aef412975fa89..b4526d89ffb0f 100644
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec
@@ -1,5 +1,5 @@
standard aggs
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS count=COUNT(),
@@ -17,7 +17,7 @@ count:long | sum:long | avg:double | count_distinct:long | category:keyw
;
values aggs
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS values=MV_SORT(VALUES(message)),
@@ -33,7 +33,7 @@ values:keyword | top
;
mv
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM mv_sample_data
| STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(message)
@@ -48,7 +48,7 @@ COUNT():long | SUM(event_duration):long | category:keyword
;
row mv
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = ["connected to a", "connected to b", "disconnected"], str = ["a", "b", "c"]
| STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message)
@@ -61,7 +61,7 @@ COUNT():long | VALUES(str):keyword | category:keyword
;
limit before stats
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data | SORT message | LIMIT 4
| STATS count=COUNT() BY category=CATEGORIZE(message)
@@ -74,7 +74,7 @@ count:long | category:keyword
;
skips stopwords
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = ["Mon Tue connected to a", "Jul Aug connected to b September ", "UTC connected GMT to c UTC"]
| STATS COUNT() BY category=CATEGORIZE(message)
@@ -86,7 +86,7 @@ COUNT():long | category:keyword
;
with multiple indices
-required_capability: categorize_v5
+required_capability: categorize_v6
required_capability: union_types
FROM sample_data*
@@ -101,7 +101,7 @@ COUNT():long | category:keyword
;
mv with many values
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM employees
| STATS COUNT() BY category=CATEGORIZE(job_positions)
@@ -118,7 +118,7 @@ COUNT():long | category:keyword
;
mv with many values and SUM
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM employees
| STATS SUM(languages) BY category=CATEGORIZE(job_positions)
@@ -133,7 +133,7 @@ SUM(languages):long | category:keyword
;
mv with many values and nulls and SUM
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM employees
| STATS SUM(languages) BY category=CATEGORIZE(job_positions)
@@ -147,7 +147,7 @@ SUM(languages):long | category:keyword
;
mv via eval
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL message = MV_APPEND(message, "Banana")
@@ -163,7 +163,7 @@ COUNT():long | category:keyword
;
mv via eval const
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL message = ["Banana", "Bread"]
@@ -177,7 +177,7 @@ COUNT():long | category:keyword
;
mv via eval const without aliases
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL message = ["Banana", "Bread"]
@@ -191,7 +191,7 @@ COUNT():long | CATEGORIZE(message):keyword
;
mv const in parameter
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY c = CATEGORIZE(["Banana", "Bread"])
@@ -204,7 +204,7 @@ COUNT():long | c:keyword
;
agg alias shadowing
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS c = COUNT() BY c = CATEGORIZE(["Banana", "Bread"])
@@ -219,7 +219,7 @@ c:keyword
;
chained aggregations using categorize
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE(message)
@@ -234,7 +234,7 @@ COUNT():long | category:keyword
;
stats without aggs
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS BY category=CATEGORIZE(message)
@@ -248,7 +248,7 @@ category:keyword
;
text field
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM hosts
| STATS COUNT() BY category=CATEGORIZE(host_group)
@@ -256,9 +256,9 @@ FROM hosts
;
COUNT():long | category:keyword
+ 2 | .*?DB.+?servers.*?
2 | .*?Gateway.+?instances.*?
5 | .*?Kubernetes.+?cluster.*?
- 2 | .*?servers.*?
1 | null
// Note: DB is removed from "DB servers", because the ml_standard
@@ -266,7 +266,7 @@ COUNT():long | category:keyword
;
on TO_UPPER
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE(TO_UPPER(message))
@@ -280,7 +280,7 @@ COUNT():long | category:keyword
;
on CONCAT
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " banana"))
@@ -294,7 +294,7 @@ COUNT():long | category:keyword
;
on CONCAT with unicode
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " 👍🏽😊"))
@@ -302,13 +302,13 @@ FROM sample_data
;
COUNT():long | category:keyword
- 3 | .*?Connected.+?to.*?
- 3 | .*?Connection.+?error.*?
- 1 | .*?Disconnected.*?
+3 | .*?Connected.+?to.+?👍🏽.+?😊.*?
+3 | .*?Connection.+?error.+?👍🏽.+?😊.*?
+1 | .*?Disconnected.+?👍🏽.+?😊.*?
;
on REVERSE(CONCAT())
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE(REVERSE(CONCAT(message, " 👍🏽😊")))
@@ -316,13 +316,13 @@ FROM sample_data
;
COUNT():long | category:keyword
- 1 | .*?detcennocsiD.*?
- 3 | .*?ot.+?detcennoC.*?
- 3 | .*?rorre.+?noitcennoC.*?
+1 | .*?😊.+?👍🏽.+?detcennocsiD.*?
+3 | .*?😊.+?👍🏽.+?ot.+?detcennoC.*?
+3 | .*?😊.+?👍🏽.+?rorre.+?noitcennoC.*?
;
and then TO_LOWER
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE(message)
@@ -337,7 +337,7 @@ COUNT():long | category:keyword
;
on const empty string
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE("")
@@ -349,7 +349,7 @@ COUNT():long | category:keyword
;
on const empty string from eval
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL x = ""
@@ -362,7 +362,7 @@ COUNT():long | category:keyword
;
on null
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL x = null
@@ -375,7 +375,7 @@ COUNT():long | SUM(event_duration):long | category:keyword
;
on null string
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL x = null::string
@@ -388,7 +388,7 @@ COUNT():long | category:keyword
;
on const null
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(null)
@@ -400,7 +400,7 @@ COUNT():long | SUM(event_duration):long | category:keyword
;
on null row
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = null, str = ["a", "b", "c"]
| STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message)
@@ -411,7 +411,7 @@ COUNT():long | VALUES(str):keyword | category:keyword
;
filtering out all data
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| WHERE @timestamp < "2023-10-23T00:00:00Z"
@@ -423,7 +423,7 @@ COUNT():long | category:keyword
;
filtering out all data with constant
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT() BY category=CATEGORIZE(message)
@@ -434,7 +434,7 @@ COUNT():long | category:keyword
;
drop output columns
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS count=COUNT() BY category=CATEGORIZE(message)
@@ -449,7 +449,7 @@ x:integer
;
category value processing
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = ["connected to a", "connected to b", "disconnected"]
| STATS COUNT() BY category=CATEGORIZE(message)
@@ -463,7 +463,7 @@ COUNT():long | category:keyword
;
row aliases
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = "connected to xyz"
| EVAL x = message
@@ -477,7 +477,7 @@ COUNT():long | category:keyword | y:keyword
;
from aliases
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL x = message
@@ -493,7 +493,7 @@ COUNT():long | category:keyword | y:keyword
;
row aliases with keep
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = "connected to xyz"
| EVAL x = message
@@ -509,7 +509,7 @@ COUNT():long | y:keyword
;
from aliases with keep
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| EVAL x = message
@@ -527,7 +527,7 @@ COUNT():long | y:keyword
;
row rename
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = "connected to xyz"
| RENAME message as x
@@ -541,7 +541,7 @@ COUNT():long | y:keyword
;
from rename
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| RENAME message as x
@@ -557,7 +557,7 @@ COUNT():long | y:keyword
;
row drop
-required_capability: categorize_v5
+required_capability: categorize_v6
ROW message = "connected to a"
| STATS c = COUNT() BY category=CATEGORIZE(message)
@@ -570,7 +570,7 @@ c:long
;
from drop
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS c = COUNT() BY category=CATEGORIZE(message)
@@ -585,7 +585,7 @@ c:long
;
reuse categorize arg expression in agg
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS m = MAX(LENGTH(CONCAT(message, "_end"))) BY c = CATEGORIZE(CONCAT(message, "_end"))
@@ -600,7 +600,7 @@ m:integer |c:keyword
categorize in aggs inside function
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT(), x = MV_APPEND(category, category) BY category=CATEGORIZE(message)
@@ -615,7 +615,7 @@ COUNT():long | x:keyword
;
categorize in aggs same as grouping inside function
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT(), x = MV_APPEND(CATEGORIZE(message), `CATEGORIZE(message)`) BY CATEGORIZE(message)
@@ -630,7 +630,7 @@ COUNT():long | x:keyword
;
categorize in aggs same as grouping inside function with explicit alias
-required_capability: categorize_v5
+required_capability: categorize_v6
FROM sample_data
| STATS COUNT(), x = MV_APPEND(CATEGORIZE(message), category) BY category=CATEGORIZE(message)
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec
index a6bb68de16b78..1d2fade118e35 100644
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec
@@ -690,7 +690,7 @@ Bangalore | 9 | 72
;
docsCategorize
-required_capability: categorize_v5
+required_capability: categorize_v6
// tag::docsCategorize[]
FROM sample_data
| STATS count=COUNT() BY category=CATEGORIZE(message)
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
index c298280dfa532..54487448a1d7c 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
@@ -665,7 +665,7 @@ public enum Cap {
/**
* Supported the text categorization function "CATEGORIZE".
*/
- CATEGORIZE_V5,
+ CATEGORIZE_V6,
/**
* Support for multiple groupings in "CATEGORIZE".