Skip to content

Commit 45a8766

Browse files
authored
Replace PatternTokenizer with a CharTokenizer (#134992)
to avoid relying on Java regex.
1 parent 51613ec commit 45a8766

File tree

1 file changed

+29
-10
lines changed

1 file changed

+29
-10
lines changed

x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patterntext/DelimiterAnalyzer.java

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,10 @@
1111
import org.apache.lucene.analysis.LowerCaseFilter;
1212
import org.apache.lucene.analysis.TokenStream;
1313
import org.apache.lucene.analysis.Tokenizer;
14-
import org.apache.lucene.analysis.pattern.PatternTokenizer;
15-
import org.elasticsearch.common.regex.Regex;
14+
import org.apache.lucene.analysis.util.CharTokenizer;
1615
import org.elasticsearch.index.analysis.AnalyzerScope;
1716
import org.elasticsearch.index.analysis.NamedAnalyzer;
1817

19-
import java.util.regex.Pattern;
20-
2118
/**
2219
* An analyzer that tokenizes text by a pre-defined list of delimiters that work well for log messages.
2320
* The pre-defined list of delimiters is: whitespace characters, =, ?, :, [, ], {, }, ", \, '
@@ -26,15 +23,11 @@ public final class DelimiterAnalyzer extends Analyzer {
2623

2724
static final NamedAnalyzer INSTANCE = new NamedAnalyzer("delimiter", AnalyzerScope.GLOBAL, new DelimiterAnalyzer());
2825

29-
private final Pattern pattern;
30-
31-
private DelimiterAnalyzer() {
32-
this.pattern = Regex.compile("[\\s\\=\\?\\:\\[\\]\\{\\}\\\"\\\\\\']", null);
33-
}
26+
// Private and empty: the analyzer instance wrapped by INSTANCE is created inside this class.
private DelimiterAnalyzer() {}
3427

3528
/**
 * Builds the analysis chain for this analyzer: a tokenizer whose output is passed through a
 * {@link LowerCaseFilter}. In this diff the tokenizer changes from a regex-based PatternTokenizer
 * (removed line) to DelimiterTokenizer (added line). The String argument is not used by the body.
 */
@Override
3629
protected TokenStreamComponents createComponents(String s) {
37-
final Tokenizer tokenizer = new PatternTokenizer(pattern, -1);
30+
final Tokenizer tokenizer = new DelimiterTokenizer();
3831
TokenStream stream = new LowerCaseFilter(tokenizer);
3932
return new TokenStreamComponents(tokenizer, stream);
4033
}
@@ -45,4 +38,30 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
4538
stream = new LowerCaseFilter(stream);
4639
return stream;
4740
}
41+
42+
static final class DelimiterTokenizer extends CharTokenizer {
43+
44+
DelimiterTokenizer() {
45+
super(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
46+
}
47+
48+
@Override
49+
protected boolean isTokenChar(int c) {
50+
if (Character.isWhitespace(c)
51+
|| c == '='
52+
|| c == '?'
53+
|| c == ':'
54+
|| c == '['
55+
|| c == ']'
56+
|| c == '{'
57+
|| c == '}'
58+
|| c == '"'
59+
|| c == '\\'
60+
|| c == '\'') {
61+
return false;
62+
} else {
63+
return true;
64+
}
65+
}
66+
}
4867
}

0 commit comments

Comments
 (0)