Skip to content

Commit ce96ca2

Browse files
committed
Use standard tokenizer in ES|QL categorize
1 parent f56c6f1 commit ce96ca2

File tree

4 files changed

+26
-9
lines changed

4 files changed

+26
-9
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,23 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L
210210
.build();
211211
}
212212

213+
/**
214+
* Create a <code>categorization_analyzer</code> that will be used by the ES|QL categorize function.
215+
* The only difference from the DSL analyzer is the tokenizer (standard instead of ml_standard).
216+
* This means the results are slightly different from the categorize text aggregation and the ML job;
217+
* however, you can use these tokens for looking up messages in indices generated with the standard
218+
* tokenizer. This lookup compatibility is considered more important than identical results.
219+
*/
220+
public static CategorizationAnalyzerConfig buildStandardEsqlCategorizationAnalyzer(List<String> categorizationFilters) {
221+
222+
return new CategorizationAnalyzerConfig.Builder().addCharFilter("first_line_with_letters")
223+
.addCategorizationFilters(categorizationFilters)
224+
.setTokenizer("standard")
225+
.addDateWordsTokenFilter()
226+
.addLimitFilter()
227+
.build();
228+
}
229+
213230
private final String analyzer;
214231
private final List<NameOrDefinition> charFilters;
215232
private final NameOrDefinition tokenizer;

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
*/
4949
public class CategorizeBlockHash extends BlockHash {
5050

51-
private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig.buildStandardCategorizationAnalyzer(
51+
private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig.buildStandardEsqlCategorizationAnalyzer(
5252
List.of()
5353
);
5454
private static final int NULL_ORD = 0;

x-pack/plugin/esql/qa/testFixtures/src/main/resources/categorize.csv-spec

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,9 @@ FROM hosts
256256
;
257257

258258
COUNT():long | category:keyword
259+
2 | .*?DB.+?servers.*?
259260
2 | .*?Gateway.+?instances.*?
260261
5 | .*?Kubernetes.+?cluster.*?
261-
2 | .*?servers.*?
262262
1 | null
263263

264264
// Note: DB is removed from "DB servers", because the ml_standard
@@ -302,9 +302,9 @@ FROM sample_data
302302
;
303303

304304
COUNT():long | category:keyword
305-
3 | .*?Connected.+?to.*?
306-
3 | .*?Connection.+?error.*?
307-
1 | .*?Disconnected.*?
305+
3 | .*?Connected.+?to.+?👍🏽.+?😊.*?
306+
3 | .*?Connection.+?error.+?👍🏽.+?😊.*?
307+
1 | .*?Disconnected.+?👍🏽.+?😊.*?
308308
;
309309

310310
on REVERSE(CONCAT())
@@ -316,9 +316,9 @@ FROM sample_data
316316
;
317317

318318
COUNT():long | category:keyword
319-
1 | .*?detcennocsiD.*?
320-
3 | .*?ot.+?detcennoC.*?
321-
3 | .*?rorre.+?noitcennoC.*?
319+
1 | .*?😊.+?👍🏽.+?detcennocsiD.*?
320+
3 | .*?😊.+?👍🏽.+?ot.+?detcennoC.*?
321+
3 | .*?😊.+?👍🏽.+?rorre.+?noitcennoC.*?
322322
;
323323

324324
and then TO_LOWER

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ public class CsvTests extends ESTestCase {
192192

193193
@ParametersFactory(argumentFormatting = "%2$s.%3$s")
194194
public static List<Object[]> readScriptSpec() throws Exception {
195-
List<URL> urls = classpathResources("/*.csv-spec");
195+
List<URL> urls = classpathResources("/categorize.csv-spec");
196196
assertThat("Not enough specs found " + urls, urls, hasSize(greaterThan(0)));
197197
return SpecReader.readScriptSpec(urls, specParser());
198198
}

0 commit comments

Comments
 (0)