1111import org .apache .lucene .analysis .LowerCaseFilter ;
1212import org .apache .lucene .analysis .TokenStream ;
1313import org .apache .lucene .analysis .Tokenizer ;
14- import org .apache .lucene .analysis .pattern .PatternTokenizer ;
15- import org .elasticsearch .common .regex .Regex ;
14+ import org .apache .lucene .analysis .util .CharTokenizer ;
1615import org .elasticsearch .index .analysis .AnalyzerScope ;
1716import org .elasticsearch .index .analysis .NamedAnalyzer ;
1817
19- import java .util .regex .Pattern ;
20-
2118/**
2219 * An analyzer that tokenizes text by a pre-defined list of delimiters that work well for log messages.
2320 * The pre-defined list of delimiters is: whitespace characters, =, ?, :, [, ], {, }, ", \, '
@@ -26,15 +23,11 @@ public final class DelimiterAnalyzer extends Analyzer {
2623
2724 static final NamedAnalyzer INSTANCE = new NamedAnalyzer ("delimiter" , AnalyzerScope .GLOBAL , new DelimiterAnalyzer ());
2825
29- private final Pattern pattern ;
30-
31- private DelimiterAnalyzer () {
32- this .pattern = Regex .compile ("[\\ s\\ =\\ ?\\ :\\ [\\ ]\\ {\\ }\\ \" \\ \\ \\ ']" , null );
33- }
26+ private DelimiterAnalyzer () {}
3427
3528 @ Override
3629 protected TokenStreamComponents createComponents (String s ) {
37- final Tokenizer tokenizer = new PatternTokenizer ( pattern , - 1 );
30+ final Tokenizer tokenizer = new DelimiterTokenizer ( );
3831 TokenStream stream = new LowerCaseFilter (tokenizer );
3932 return new TokenStreamComponents (tokenizer , stream );
4033 }
@@ -45,4 +38,30 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
4538 stream = new LowerCaseFilter (stream );
4639 return stream ;
4740 }
41+
42+ static final class DelimiterTokenizer extends CharTokenizer {
43+
44+ DelimiterTokenizer () {
45+ super (TokenStream .DEFAULT_TOKEN_ATTRIBUTE_FACTORY );
46+ }
47+
48+ @ Override
49+ protected boolean isTokenChar (int c ) {
50+ if (Character .isWhitespace (c )
51+ || c == '='
52+ || c == '?'
53+ || c == ':'
54+ || c == '['
55+ || c == ']'
56+ || c == '{'
57+ || c == '}'
58+ || c == '"'
59+ || c == '\\'
60+ || c == '\'' ) {
61+ return false ;
62+ } else {
63+ return true ;
64+ }
65+ }
66+ }
4867}
0 commit comments