17
17
18
18
package org .apache .lucene .analysis .pattern ;
19
19
20
- import org .apache .lucene .analysis .TokenFilterFactory ;
21
- import org .apache .lucene .analysis .TokenStream ;
22
- import org .apache .lucene .analysis .pattern .PatternTypingFilter .PatternTypingRule ;
23
- import org .apache .lucene .util .ResourceLoader ;
24
- import org .apache .lucene .util .ResourceLoaderAware ;
25
-
26
20
import java .io .IOException ;
27
21
import java .util .ArrayList ;
28
22
import java .util .List ;
29
23
import java .util .Map ;
30
24
import java .util .regex .Pattern ;
31
-
25
+ import org .apache .lucene .analysis .TokenFilterFactory ;
26
+ import org .apache .lucene .analysis .TokenStream ;
27
+ import org .apache .lucene .analysis .pattern .PatternTypingFilter .PatternTypingRule ;
28
+ import org .apache .lucene .util .ResourceLoader ;
29
+ import org .apache .lucene .util .ResourceLoaderAware ;
32
30
33
31
/**
34
- * Provides a filter that will analyze tokens with the analyzer from an arbitrary field type. By itself this
35
- * filter is not very useful. Normally it is combined with a filter that reacts to types or flags.
32
+ * Provides a filter that will analyze tokens with the analyzer from an arbitrary field type. By
33
+ * itself this filter is not very useful. Normally it is combined with a filter that reacts to types
34
+ * or flags.
36
35
*
37
36
* <pre class="prettyprint" >
38
37
* <fieldType name="text_taf" class="solr.TextField" positionIncrementGap="100">
44
43
* ignore="word,&lt;ALPHANUM&gt;,&lt;NUM&gt;,&lt;SOUTHEAST_ASIAN&gt;,&lt;IDEOGRAPHIC&gt;,&lt;HIRAGANA&gt;,&lt;KATAKANA&gt;,&lt;HANGUL&gt;,&lt;EMOJI&gt;"/>
45
44
* </analyzer>
46
45
* </fieldType></pre>
47
- * <p>
48
- * Note that a configuration such as above may interfere with multi-word synonyms. The patterns file has the format:
46
+ *
47
+ * <p>Note that a configuration such as above may interfere with multi-word synonyms. The patterns
48
+ * file has the format:
49
+ *
49
50
* <pre>
50
51
* (flags) (pattern) ::: (replacement)
51
52
* </pre>
52
- * Therefore to set the first 2 flag bits on the original token matching 401k or 401(k) and adding a type of
53
- * 'legal2_401_k' whenever either one is encountered one would use:
53
+ *
54
+ * Therefore, to set the first 2 flag bits on the original token matching 401k or 401(k) and add a
55
+ * type of 'legal2_401_k' whenever either one is encountered, one would use:
56
+ *
54
57
* <pre>
55
58
* 3 (\d+)\(?([a-z])\)? ::: legal2_$1_$2
56
59
* </pre>
57
- * Note that the number indicating the flag bits to set must not have leading spaces and be followed by a single
58
- * space, and must be 0 if no flags should be set. The flags number should not contain commas or a decimal point.
59
- * Lines for which the first character is <code>#</code> will be ignored as comments. Does not support producing
60
- * a synonym textually identical to the original term.
60
+ *
61
+ * Note that the number indicating the flag bits to set must not have leading spaces, must be followed
62
+ * by a single space, and must be 0 if no flags should be set. The flags number should not contain
63
+ * commas or a decimal point. Lines for which the first character is <code>#</code> will be ignored
64
+ * as comments. Does not support producing a synonym textually identical to the original term.
61
65
*
62
66
* @lucene.spi {@value #NAME}
63
67
* @since 8.8
64
68
*/
65
69
public class PatternTypingFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
66
70
67
- /**
68
- * SPI name
69
- */
71
+ /** SPI name */
70
72
public static final String NAME = "patternTyping" ;
71
73
72
74
private final String patternFile ;
73
75
private PatternTypingRule [] rules ;
74
76
75
- /**
76
- * Creates a new PatternTypingFilterFactory
77
- */
77
+ /** Creates a new PatternTypingFilterFactory */
78
78
public PatternTypingFilterFactory (Map <String , String > args ) {
79
79
super (args );
80
80
patternFile = require (args , "patternFile" );
@@ -83,9 +83,7 @@ public PatternTypingFilterFactory(Map<String, String> args) {
83
83
}
84
84
}
85
85
86
- /**
87
- * Default ctor for compatibility with SPI
88
- */
86
+ /** Default ctor for compatibility with SPI */
89
87
public PatternTypingFilterFactory () {
90
88
throw defaultCtorException ();
91
89
}
@@ -94,16 +92,19 @@ public PatternTypingFilterFactory() {
94
92
public void inform (ResourceLoader loader ) throws IOException {
95
93
List <PatternTypingRule > ruleList = new ArrayList <>();
96
94
List <String > lines = getLines (loader , patternFile );
97
- // format: # regex ::: typename[_$1[_$2 ...]] (technically _$1 does not need the '_' but it usually makes sense)
95
+ // format: # regex ::: typename[_$1[_$2 ...]] (technically _$1 does not need the '_' but it
96
+ // usually makes sense)
98
97
// eg: 2 (\d+)\(?([a-z])\)?\(?(\d+)\)? ::: legal3_$1_$2_$3
99
98
// which yields legal3_501_c_3 for 501(c)(3) or 501c3 and sets the second lowest bit in flags
100
99
for (String line : lines ) {
101
100
int firstSpace = line .indexOf (" " ); // no leading spaces allowed
102
101
int flagsVal = Integer .parseInt (line .substring (0 , firstSpace ));
103
102
line = line .substring (firstSpace + 1 );
104
- String [] split = line .split (" ::: " ); // arbitrary, unlikely to occur in a useful regex easy to read
103
+ String [] split =
104
+ line .split (" ::: " ); // arbitrary, unlikely to occur in a useful regex easy to read
105
105
if (split .length != 2 ) {
106
- throw new RuntimeException ("The PatternTypingFilter: Always two there are, no more, no less, a pattern and a replacement (separated by ' ::: ' )" );
106
+ throw new RuntimeException (
107
+ "The PatternTypingFilter: Always two there are, no more, no less, a pattern and a replacement (separated by ' ::: ' )" );
107
108
}
108
109
Pattern compiled = Pattern .compile (split [0 ]);
109
110
ruleList .add (new PatternTypingRule (compiled , flagsVal , split [1 ]));
0 commit comments