Skip to content

Commit 595674b

Browse files
committed
Use standard tokenizer in ES|QL categorize
1 parent f56c6f1 commit 595674b

File tree

3 files changed

+26
-10
lines changed

3 files changed

+26
-10
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,23 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L
210210
.build();
211211
}
212212

213+
/**
214+
* Create a <code>categorization_analyzer</code> that will be used by the ES|QL categorize function.
215+
* The only difference from the DSL analyzer is the tokenizer (standard instead of ml_standard).
216+
* This means the results are slightly different from the categorize_text aggregation and ML categorization jobs,
217+
* however you can use these tokens for looking up messages in indices generated with the standard
218+
* tokenizer. That lookup compatibility is considered more important than matching those results exactly.
219+
*/
220+
public static CategorizationAnalyzerConfig buildStandardEsqlCategorizationAnalyzer(List<String> categorizationFilters) {
221+
222+
return new CategorizationAnalyzerConfig.Builder().addCharFilter("first_line_with_letters")
223+
.addCategorizationFilters(categorizationFilters)
224+
.setTokenizer("standard")
225+
.addDateWordsTokenFilter()
226+
.addLimitFilter()
227+
.build();
228+
}
229+
213230
private final String analyzer;
214231
private final List<NameOrDefinition> charFilters;
215232
private final NameOrDefinition tokenizer;

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,8 @@
4848
*/
4949
public class CategorizeBlockHash extends BlockHash {
5050

51-
private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig.buildStandardCategorizationAnalyzer(
52-
List.of()
53-
);
51+
private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig
52+
.buildStandardEsqlCategorizationAnalyzer(List.of());
5453
private static final int NULL_ORD = 0;
5554

5655
private final int channel;

x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,9 @@ FROM hosts
256256
;
257257

258258
COUNT():long | category:keyword
259+
2 | .*?DB.+?servers.*?
259260
2 | .*?Gateway.+?instances.*?
260261
5 | .*?Kubernetes.+?cluster.*?
261-
2 | .*?servers.*?
262262
1 | null
263263

264264
// Note: DB is removed from "DB servers", because the ml_standard
@@ -302,9 +302,9 @@ FROM sample_data
302302
;
303303

304304
COUNT():long | category:keyword
305-
3 | .*?Connected.+?to.*?
306-
3 | .*?Connection.+?error.*?
307-
1 | .*?Disconnected.*?
305+
3 | .*?Connected.+?to.+?👍🏽.+?😊.*?
306+
3 | .*?Connection.+?error.+?👍🏽.+?😊.*?
307+
1 | .*?Disconnected.+?👍🏽.+?😊.*?
308308
;
309309

310310
on REVERSE(CONCAT())
@@ -316,9 +316,9 @@ FROM sample_data
316316
;
317317

318318
COUNT():long | category:keyword
319-
1 | .*?detcennocsiD.*?
320-
3 | .*?ot.+?detcennoC.*?
321-
3 | .*?rorre.+?noitcennoC.*?
319+
1 | .*?😊.+?👍🏽.+?detcennocsiD.*?
320+
3 | .*?😊.+?👍🏽.+?ot.+?detcennoC.*?
321+
3 | .*?😊.+?👍🏽.+?rorre.+?noitcennoC.*?
322322
;
323323

324324
and then TO_LOWER

0 commit comments

Comments
 (0)