diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java index 0337000a201f2..04c8bb0aa037c 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java @@ -210,6 +210,22 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L .build(); } + /** + * Create a categorization_analyzer that will be used by the ES|QL categorize function. + * The only difference from the DSL analyzer is the tokenizer (standard instead of ml_standard). + * This means the results are slightly different from the categorize text aggregation and the ML job; + * however, the resulting tokens can be used to look up messages in indices generated with the standard + * tokenizer. This lookup compatibility is considered more important than matching those results. 
+ */ + public static CategorizationAnalyzerConfig buildStandardEsqlCategorizationAnalyzer() { + + return new CategorizationAnalyzerConfig.Builder().addCharFilter("first_line_with_letters") + .setTokenizer("standard") + .addDateWordsTokenFilter() + .addLimitFilter() + .build(); + } + private final String analyzer; private final List charFilters; private final NameOrDefinition tokenizer; diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java index f83776fbdbc85..5e716d8c9d5ff 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java @@ -39,7 +39,6 @@ import java.io.IOException; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Objects; @@ -48,9 +47,8 @@ */ public class CategorizeBlockHash extends BlockHash { - private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig.buildStandardCategorizationAnalyzer( - List.of() - ); + private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig + .buildStandardEsqlCategorizationAnalyzer(); private static final int NULL_ORD = 0; private final int channel; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec index aef412975fa89..b4526d89ffb0f 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec @@ -1,5 +1,5 @@ standard aggs -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS 
count=COUNT(), @@ -17,7 +17,7 @@ count:long | sum:long | avg:double | count_distinct:long | category:keyw ; values aggs -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS values=MV_SORT(VALUES(message)), @@ -33,7 +33,7 @@ values:keyword | top ; mv -required_capability: categorize_v5 +required_capability: categorize_v6 FROM mv_sample_data | STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(message) @@ -48,7 +48,7 @@ COUNT():long | SUM(event_duration):long | category:keyword ; row mv -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = ["connected to a", "connected to b", "disconnected"], str = ["a", "b", "c"] | STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message) @@ -61,7 +61,7 @@ COUNT():long | VALUES(str):keyword | category:keyword ; limit before stats -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | SORT message | LIMIT 4 | STATS count=COUNT() BY category=CATEGORIZE(message) @@ -74,7 +74,7 @@ count:long | category:keyword ; skips stopwords -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = ["Mon Tue connected to a", "Jul Aug connected to b September ", "UTC connected GMT to c UTC"] | STATS COUNT() BY category=CATEGORIZE(message) @@ -86,7 +86,7 @@ COUNT():long | category:keyword ; with multiple indices -required_capability: categorize_v5 +required_capability: categorize_v6 required_capability: union_types FROM sample_data* @@ -101,7 +101,7 @@ COUNT():long | category:keyword ; mv with many values -required_capability: categorize_v5 +required_capability: categorize_v6 FROM employees | STATS COUNT() BY category=CATEGORIZE(job_positions) @@ -118,7 +118,7 @@ COUNT():long | category:keyword ; mv with many values and SUM -required_capability: categorize_v5 +required_capability: categorize_v6 FROM employees | STATS SUM(languages) BY category=CATEGORIZE(job_positions) @@ -133,7 +133,7 @@ 
SUM(languages):long | category:keyword ; mv with many values and nulls and SUM -required_capability: categorize_v5 +required_capability: categorize_v6 FROM employees | STATS SUM(languages) BY category=CATEGORIZE(job_positions) @@ -147,7 +147,7 @@ SUM(languages):long | category:keyword ; mv via eval -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL message = MV_APPEND(message, "Banana") @@ -163,7 +163,7 @@ COUNT():long | category:keyword ; mv via eval const -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL message = ["Banana", "Bread"] @@ -177,7 +177,7 @@ COUNT():long | category:keyword ; mv via eval const without aliases -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL message = ["Banana", "Bread"] @@ -191,7 +191,7 @@ COUNT():long | CATEGORIZE(message):keyword ; mv const in parameter -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY c = CATEGORIZE(["Banana", "Bread"]) @@ -204,7 +204,7 @@ COUNT():long | c:keyword ; agg alias shadowing -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS c = COUNT() BY c = CATEGORIZE(["Banana", "Bread"]) @@ -219,7 +219,7 @@ c:keyword ; chained aggregations using categorize -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(message) @@ -234,7 +234,7 @@ COUNT():long | category:keyword ; stats without aggs -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS BY category=CATEGORIZE(message) @@ -248,7 +248,7 @@ category:keyword ; text field -required_capability: categorize_v5 +required_capability: categorize_v6 FROM hosts | STATS COUNT() BY category=CATEGORIZE(host_group) @@ -256,9 +256,9 @@ FROM hosts ; COUNT():long | category:keyword + 2 | .*?DB.+?servers.*? 
2 | .*?Gateway.+?instances.*? 5 | .*?Kubernetes.+?cluster.*? - 2 | .*?servers.*? 1 | null // Note: DB is removed from "DB servers", because the ml_standard @@ -266,7 +266,7 @@ COUNT():long | category:keyword ; on TO_UPPER -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(TO_UPPER(message)) @@ -280,7 +280,7 @@ COUNT():long | category:keyword ; on CONCAT -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " banana")) @@ -294,7 +294,7 @@ COUNT():long | category:keyword ; on CONCAT with unicode -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(CONCAT(message, " 👍🏽😊")) @@ -302,13 +302,13 @@ FROM sample_data ; COUNT():long | category:keyword - 3 | .*?Connected.+?to.*? - 3 | .*?Connection.+?error.*? - 1 | .*?Disconnected.*? +3 | .*?Connected.+?to.+?👍🏽.+?😊.*? +3 | .*?Connection.+?error.+?👍🏽.+?😊.*? +1 | .*?Disconnected.+?👍🏽.+?😊.*? ; on REVERSE(CONCAT()) -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(REVERSE(CONCAT(message, " 👍🏽😊"))) @@ -316,13 +316,13 @@ FROM sample_data ; COUNT():long | category:keyword - 1 | .*?detcennocsiD.*? - 3 | .*?ot.+?detcennoC.*? - 3 | .*?rorre.+?noitcennoC.*? +1 | .*?😊.+?👍🏽.+?detcennocsiD.*? +3 | .*?😊.+?👍🏽.+?ot.+?detcennoC.*? +3 | .*?😊.+?👍🏽.+?rorre.+?noitcennoC.*? 
; and then TO_LOWER -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(message) @@ -337,7 +337,7 @@ COUNT():long | category:keyword ; on const empty string -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE("") @@ -349,7 +349,7 @@ COUNT():long | category:keyword ; on const empty string from eval -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL x = "" @@ -362,7 +362,7 @@ COUNT():long | category:keyword ; on null -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL x = null @@ -375,7 +375,7 @@ COUNT():long | SUM(event_duration):long | category:keyword ; on null string -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL x = null::string @@ -388,7 +388,7 @@ COUNT():long | category:keyword ; on const null -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT(), SUM(event_duration) BY category=CATEGORIZE(null) @@ -400,7 +400,7 @@ COUNT():long | SUM(event_duration):long | category:keyword ; on null row -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = null, str = ["a", "b", "c"] | STATS COUNT(), VALUES(str) BY category=CATEGORIZE(message) @@ -411,7 +411,7 @@ COUNT():long | VALUES(str):keyword | category:keyword ; filtering out all data -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | WHERE @timestamp < "2023-10-23T00:00:00Z" @@ -423,7 +423,7 @@ COUNT():long | category:keyword ; filtering out all data with constant -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT() BY category=CATEGORIZE(message) @@ -434,7 +434,7 @@ COUNT():long | category:keyword ; drop output columns -required_capability: categorize_v5 
+required_capability: categorize_v6 FROM sample_data | STATS count=COUNT() BY category=CATEGORIZE(message) @@ -449,7 +449,7 @@ x:integer ; category value processing -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = ["connected to a", "connected to b", "disconnected"] | STATS COUNT() BY category=CATEGORIZE(message) @@ -463,7 +463,7 @@ COUNT():long | category:keyword ; row aliases -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = "connected to xyz" | EVAL x = message @@ -477,7 +477,7 @@ COUNT():long | category:keyword | y:keyword ; from aliases -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL x = message @@ -493,7 +493,7 @@ COUNT():long | category:keyword | y:keyword ; row aliases with keep -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = "connected to xyz" | EVAL x = message @@ -509,7 +509,7 @@ COUNT():long | y:keyword ; from aliases with keep -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | EVAL x = message @@ -527,7 +527,7 @@ COUNT():long | y:keyword ; row rename -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = "connected to xyz" | RENAME message as x @@ -541,7 +541,7 @@ COUNT():long | y:keyword ; from rename -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | RENAME message as x @@ -557,7 +557,7 @@ COUNT():long | y:keyword ; row drop -required_capability: categorize_v5 +required_capability: categorize_v6 ROW message = "connected to a" | STATS c = COUNT() BY category=CATEGORIZE(message) @@ -570,7 +570,7 @@ c:long ; from drop -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS c = COUNT() BY category=CATEGORIZE(message) @@ -585,7 +585,7 @@ c:long ; reuse categorize arg expression in agg -required_capability: categorize_v5 +required_capability: 
categorize_v6 FROM sample_data | STATS m = MAX(LENGTH(CONCAT(message, "_end"))) BY c = CATEGORIZE(CONCAT(message, "_end")) @@ -600,7 +600,7 @@ m:integer |c:keyword categorize in aggs inside function -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT(), x = MV_APPEND(category, category) BY category=CATEGORIZE(message) @@ -615,7 +615,7 @@ COUNT():long | x:keyword ; categorize in aggs same as grouping inside function -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT(), x = MV_APPEND(CATEGORIZE(message), `CATEGORIZE(message)`) BY CATEGORIZE(message) @@ -630,7 +630,7 @@ COUNT():long | x:keyword ; categorize in aggs same as grouping inside function with explicit alias -required_capability: categorize_v5 +required_capability: categorize_v6 FROM sample_data | STATS COUNT(), x = MV_APPEND(CATEGORIZE(message), category) BY category=CATEGORIZE(message) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec index a6bb68de16b78..1d2fade118e35 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/docs.csv-spec @@ -690,7 +690,7 @@ Bangalore | 9 | 72 ; docsCategorize -required_capability: categorize_v5 +required_capability: categorize_v6 // tag::docsCategorize[] FROM sample_data | STATS count=COUNT() BY category=CATEGORIZE(message) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index c298280dfa532..54487448a1d7c 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -665,7 +665,7 @@ public enum Cap { /** 
* Supported the text categorization function "CATEGORIZE". */ - CATEGORIZE_V5, + CATEGORIZE_V6, /** * Support for multiple groupings in "CATEGORIZE".