11
11
import org .apache .lucene .analysis .LowerCaseFilter ;
12
12
import org .apache .lucene .analysis .TokenStream ;
13
13
import org .apache .lucene .analysis .Tokenizer ;
14
- import org .apache .lucene .analysis .pattern .PatternTokenizer ;
15
- import org .elasticsearch .common .regex .Regex ;
14
+ import org .apache .lucene .analysis .util .CharTokenizer ;
16
15
import org .elasticsearch .index .analysis .AnalyzerScope ;
17
16
import org .elasticsearch .index .analysis .NamedAnalyzer ;
18
17
19
- import java .util .regex .Pattern ;
20
-
21
18
/**
22
19
* An analyzer that tokenizes text by a pre-defined list of delimiters that work well for log messages.
23
20
* The pre-defined list of delimiters is: whitespace characters, =, ?, :, [, ], {, }, ", \, '
@@ -26,15 +23,11 @@ public final class DelimiterAnalyzer extends Analyzer {
26
23
27
24
static final NamedAnalyzer INSTANCE = new NamedAnalyzer ("delimiter" , AnalyzerScope .GLOBAL , new DelimiterAnalyzer ());
28
25
29
- private final Pattern pattern ;
30
-
31
- private DelimiterAnalyzer () {
32
- this .pattern = Regex .compile ("[\\ s\\ =\\ ?\\ :\\ [\\ ]\\ {\\ }\\ \" \\ \\ \\ ']" , null );
33
- }
26
/** Private: the only construction visible in this file is the one wrapped by {@link #INSTANCE}. */
private DelimiterAnalyzer() {
    // no state to initialise
}
34
27
35
28
@ Override
36
29
protected TokenStreamComponents createComponents (String s ) {
37
- final Tokenizer tokenizer = new PatternTokenizer ( pattern , - 1 );
30
+ final Tokenizer tokenizer = new DelimiterTokenizer ( );
38
31
TokenStream stream = new LowerCaseFilter (tokenizer );
39
32
return new TokenStreamComponents (tokenizer , stream );
40
33
}
@@ -45,4 +38,30 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
45
38
stream = new LowerCaseFilter (stream );
46
39
return stream ;
47
40
}
41
+
42
+ static final class DelimiterTokenizer extends CharTokenizer {
43
+
44
+ DelimiterTokenizer () {
45
+ super (TokenStream .DEFAULT_TOKEN_ATTRIBUTE_FACTORY );
46
+ }
47
+
48
+ @ Override
49
+ protected boolean isTokenChar (int c ) {
50
+ if (Character .isWhitespace (c )
51
+ || c == '='
52
+ || c == '?'
53
+ || c == ':'
54
+ || c == '['
55
+ || c == ']'
56
+ || c == '{'
57
+ || c == '}'
58
+ || c == '"'
59
+ || c == '\\'
60
+ || c == '\'' ) {
61
+ return false ;
62
+ } else {
63
+ return true ;
64
+ }
65
+ }
66
+ }
48
67
}
0 commit comments