Skip to content

Commit b00f8fa

Browse files
jan-elastic and elasticsearchmachine authored
Use standard tokenizer in ES|QL categorize (#129642) (#129688)
* Use standard tokenizer in ES|QL categorize
* rename capability
* remove unused param
* [CI] Auto commit changes from spotless

---------

Co-authored-by: elasticsearchmachine <[email protected]>
1 parent 2cb2308 commit b00f8fa

File tree

5 files changed

+72
-58
lines changed

5 files changed

+72
-58
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,22 @@ public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(L
210210
.build();
211211
}
212212

213+
/**
214+
* Creates the <code>categorization_analyzer</code> that is used by the ES|QL categorize function.
215+
* The only difference from the DSL analyzer is the tokenizer (<code>standard</code> instead of <code>ml_standard</code>).
216+
* This means the results are slightly different from the categorize text aggregation and the ML job;
217+
* however, the resulting tokens can be used to look up messages in indices generated with the standard
218+
* tokenizer. The latter is considered more important.
* <p>
* The configuration is assembled with the builder as: the <code>first_line_with_letters</code> char filter,
* the <code>standard</code> tokenizer, the date-words token filter and the limit filter.
*
* @return the analyzer configuration built from the components listed above
219+
*/
220+
public static CategorizationAnalyzerConfig buildStandardEsqlCategorizationAnalyzer() {
221+
222+
return new CategorizationAnalyzerConfig.Builder().addCharFilter("first_line_with_letters")
223+
.setTokenizer("standard")
224+
.addDateWordsTokenFilter()
225+
.addLimitFilter()
226+
.build();
227+
}
228+
213229
private final String analyzer;
214230
private final List<NameOrDefinition> charFilters;
215231
private final NameOrDefinition tokenizer;

x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/blockhash/CategorizeBlockHash.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939

4040
import java.io.IOException;
4141
import java.util.HashMap;
42-
import java.util.List;
4342
import java.util.Map;
4443
import java.util.Objects;
4544

@@ -48,9 +47,8 @@
4847
*/
4948
public class CategorizeBlockHash extends BlockHash {
5049

51-
private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig.buildStandardCategorizationAnalyzer(
52-
List.of()
53-
);
50+
private static final CategorizationAnalyzerConfig ANALYZER_CONFIG = CategorizationAnalyzerConfig
51+
.buildStandardEsqlCategorizationAnalyzer();
5452
private static final int NULL_ORD = 0;
5553

5654
private final int channel;

0 commit comments

Comments
 (0)