Skip to content

Commit c087f6f

Browse files
authored
LUCENE-9575 Add PatternTypingFilter to annotate tokens with flags and types (#1995)
LUCENE-9575 Add PatternTypingFilter
1 parent 9d4811e commit c087f6f

File tree

5 files changed

+343
-0
lines changed

5 files changed

+343
-0
lines changed
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.analysis.pattern;
19+
20+
import org.apache.lucene.analysis.TokenFilter;
21+
import org.apache.lucene.analysis.TokenStream;
22+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
23+
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
24+
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
25+
26+
import java.io.IOException;
27+
import java.util.regex.Matcher;
28+
import java.util.regex.Pattern;
29+
30+
/**
31+
* Set a type attribute to a parameterized value when tokens are matched by any of several regex patterns. The
32+
* value set in the type attribute is parameterized with the match groups of the regex used for matching.
33+
* In combination with TypeAsSynonymFilter and DropIfFlagged filter this can supply complex synonym patterns
34+
* that are protected from subsequent analysis, and optionally drop the original term based on the flag
35+
* set in this filter. See {@link PatternTypingFilterFactory} for full documentation.
36+
*
37+
* @see PatternTypingFilterFactory
38+
* @since 8.8.0
39+
*/
40+
public class PatternTypingFilter extends TokenFilter {
41+
42+
private final PatternTypingRule[] replacementAndFlagByPattern;
43+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
44+
private final FlagsAttribute flagAtt = addAttribute(FlagsAttribute.class);
45+
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
46+
47+
public PatternTypingFilter(TokenStream input, PatternTypingRule... replacementAndFlagByPattern) {
48+
super(input);
49+
this.replacementAndFlagByPattern = replacementAndFlagByPattern;
50+
}
51+
52+
@Override
53+
public final boolean incrementToken() throws IOException {
54+
if (input.incrementToken()) {
55+
for (PatternTypingRule rule : replacementAndFlagByPattern) {
56+
Matcher matcher = rule.getPattern().matcher(termAtt);
57+
if (matcher.find()) {
58+
// allow 2nd reset() and find() that occurs inside replaceFirst to avoid excess string creation
59+
typeAtt.setType(matcher.replaceFirst(rule.getTypeTemplate()));
60+
flagAtt.setFlags(rule.getFlags());
61+
return true;
62+
}
63+
}
64+
return true;
65+
}
66+
return false;
67+
}
68+
69+
/**
70+
* Value holding class for pattern typing rules.
71+
*/
72+
public static class PatternTypingRule {
73+
private final Pattern pattern;
74+
private final int flags;
75+
private final String typeTemplate;
76+
77+
public PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {
78+
this.pattern = pattern;
79+
this.flags = flags;
80+
this.typeTemplate = typeTemplate;
81+
}
82+
83+
public Pattern getPattern() {
84+
return pattern;
85+
}
86+
87+
public int getFlags() {
88+
return flags;
89+
}
90+
91+
public String getTypeTemplate() {
92+
return typeTemplate;
93+
}
94+
}
95+
}
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.analysis.pattern;
19+
20+
import org.apache.lucene.analysis.TokenFilterFactory;
21+
import org.apache.lucene.analysis.TokenStream;
22+
import org.apache.lucene.analysis.pattern.PatternTypingFilter.PatternTypingRule;
23+
import org.apache.lucene.util.ResourceLoader;
24+
import org.apache.lucene.util.ResourceLoaderAware;
25+
26+
import java.io.IOException;
27+
import java.util.ArrayList;
28+
import java.util.List;
29+
import java.util.Map;
30+
import java.util.regex.Pattern;
31+
32+
33+
/**
34+
* Provides a filter that sets the type and flags of tokens matched by regex patterns read from a file. By itself this
35+
* filter is not very useful. Normally it is combined with a filter that reacts to types or flags.
36+
*
37+
* <pre class="prettyprint" >
38+
* &lt;fieldType name="text_taf" class="solr.TextField" positionIncrementGap="100"&gt;
39+
* &lt;analyzer&gt;
40+
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
41+
* &lt;filter class="com.example.PatternTypingFilter" patternFile="patterns.txt"/&gt;
42+
* &lt;filter class="solr.TokenAnalyzerFilter" asType="text_en" preserveType="true"/&gt;
43+
* &lt;filter class="solr.TypeAsSynonymFilterFactory" prefix="__TAS__"
44+
* ignore="word,&amp;lt;ALPHANUM&amp;gt;,&amp;lt;NUM&amp;gt;,&amp;lt;SOUTHEAST_ASIAN&amp;gt;,&amp;lt;IDEOGRAPHIC&amp;gt;,&amp;lt;HIRAGANA&amp;gt;,&amp;lt;KATAKANA&amp;gt;,&amp;lt;HANGUL&amp;gt;,&amp;lt;EMOJI&amp;gt;"/&gt;
45+
* &lt;/analyzer&gt;
46+
* &lt;/fieldType&gt;</pre>
47+
* <p>
48+
* Note that a configuration such as above may interfere with multi-word synonyms. The patterns file has the format:
49+
* <pre>
50+
* (flags) (pattern) ::: (replacement)
51+
* </pre>
52+
* Therefore to set the first 2 flag bits on the original token matching 401k or 401(k) and adding a type of
53+
* 'legal2_401_k' whenever either one is encountered one would use:
54+
* <pre>
55+
* 3 (\d+)\(?([a-z])\)? ::: legal2_$1_$2
56+
* </pre>
57+
* Note that the number indicating the flag bits to set must not have leading spaces and be followed by a single
58+
* space, and must be 0 if no flags should be set. The flags number should not contain commas or a decimal point.
59+
* Lines for which the first character is <code>#</code> will be ignored as comments. Does not support producing
60+
* a synonym textually identical to the original term.
61+
*
62+
* @lucene.spi {@value #NAME}
63+
* @since 8.8
64+
*/
65+
public class PatternTypingFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
66+
67+
/**
68+
* SPI name
69+
*/
70+
public static final String NAME = "patternTyping";
71+
72+
private final String patternFile;
73+
private PatternTypingRule[] rules;
74+
75+
/**
76+
* Creates a new PatternTypingFilterFactory
77+
*/
78+
public PatternTypingFilterFactory(Map<String, String> args) {
79+
super(args);
80+
patternFile = require(args, "patternFile");
81+
if (!args.isEmpty()) {
82+
throw new IllegalArgumentException("Unknown parameters: " + args);
83+
}
84+
}
85+
86+
/**
87+
* Default ctor for compatibility with SPI
88+
*/
89+
public PatternTypingFilterFactory() {
90+
throw defaultCtorException();
91+
}
92+
93+
@Override
94+
public void inform(ResourceLoader loader) throws IOException {
95+
List<PatternTypingRule> ruleList = new ArrayList<>();
96+
List<String> lines = getLines(loader, patternFile);
97+
// format: # regex ::: typename[_$1[_$2 ...]] (technically _$1 does not need the '_' but it usually makes sense)
98+
// eg: 2 (\d+\(?([a-z])\)?\(?(\d+)\)? ::: legal3_$1_$2_3
99+
// which yields legal3_501_c_3 for 501(c)(3) or 501c3 and sets the second lowest bit in flags
100+
for (String line : lines) {
101+
int firstSpace = line.indexOf(" "); // no leading spaces allowed
102+
int flagsVal = Integer.parseInt(line.substring(0, firstSpace));
103+
line = line.substring(firstSpace + 1);
104+
String[] split = line.split(" ::: "); // arbitrary, unlikely to occur in a useful regex easy to read
105+
if (split.length != 2) {
106+
throw new RuntimeException("The PatternTypingFilter: Always two there are, no more, no less, a pattern and a replacement (separated by ' ::: ' )");
107+
}
108+
Pattern compiled = Pattern.compile(split[0]);
109+
ruleList.add(new PatternTypingRule(compiled, flagsVal, split[1]));
110+
}
111+
this.rules = ruleList.toArray(new PatternTypingRule[0]);
112+
}
113+
114+
@Override
115+
public TokenStream create(TokenStream input) {
116+
return new PatternTypingFilter(input, rules);
117+
}
118+
}

lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
9797
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
9898
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
9999
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
100+
org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
100101
org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
101102
org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
102103
org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.analysis.pattern;
18+
19+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
20+
import org.apache.lucene.analysis.CannedTokenStream;
21+
import org.apache.lucene.analysis.Token;
22+
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.analysis.pattern.PatternTypingFilter.PatternTypingRule;
24+
25+
import java.io.IOException;
26+
import java.util.regex.Pattern;
27+
28+
/**
29+
* Test that this filter sets a type for tokens matching patterns defined in a patterns.txt file
30+
*/
31+
public class TestPatternTypingFilter extends BaseTokenStreamTestCase {
32+
33+
/**
34+
* Test the straight forward cases. When all flags match the token should be dropped
35+
*/
36+
public void testPatterns() throws Exception {
37+
38+
Token tokenA1 = new Token("One", 0, 2);
39+
Token tokenA2 = new Token("401(k)", 4, 9);
40+
Token tokenA3 = new Token("two", 11, 13);
41+
Token tokenB1 = new Token("three", 15, 19);
42+
Token tokenB2 = new Token("401k", 21, 24);
43+
44+
TokenStream ts = new CannedTokenStream(tokenA1, tokenA2, tokenA3, tokenB1, tokenB2);
45+
46+
//2 ^(\d+)\(?([a-z])\)?$ ::: legal2_$1_$2
47+
ts = new PatternTypingFilter(ts,
48+
new PatternTypingRule(Pattern.compile("^(\\d+)\\(?([a-z])\\)?$"),2,"legal2_$1_$2"));
49+
50+
assertTokenStreamContents(ts, new String[]{
51+
"One", "401(k)", "two", "three", "401k"}, null, null,
52+
new String[]{"word", "legal2_401_k", "word", "word", "legal2_401_k"},
53+
null, null, null, null, null, false, null,
54+
new int[]{0, 2, 0, 0, 2});
55+
}
56+
57+
public void testFirstPatternWins() throws IOException {
58+
Token tokenA1 = new Token("One", 0, 2);
59+
Token tokenA3 = new Token("forty-two", 11, 13);
60+
Token tokenB1 = new Token("4-2", 15, 19);
61+
62+
TokenStream ts = new CannedTokenStream(tokenA1, tokenA3, tokenB1);
63+
64+
//2 ^(\d+)\(?([a-z])\)?$ ::: legal2_$1_$2
65+
PatternTypingRule p1 = new PatternTypingRule(Pattern.compile("^(\\d+)-(\\d+)$"), 6, "$1_hnum_$2");
66+
PatternTypingRule p2 = new PatternTypingRule(Pattern.compile("^(\\w+)-(\\w+)$"), 2, "$1_hword_$2");
67+
68+
ts = new PatternTypingFilter(ts, p1,p2); // 101
69+
70+
assertTokenStreamContents(ts, new String[]{
71+
"One", "forty-two", "4-2"}, null, null,
72+
new String[]{"word", "forty_hword_two", "4_hnum_2"},
73+
null, null, null, null, null, false, null,
74+
new int[]{0, 2, 6});
75+
}
76+
77+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.analysis.pattern;
19+
20+
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
21+
import org.apache.lucene.analysis.CannedTokenStream;
22+
import org.apache.lucene.analysis.Token;
23+
import org.apache.lucene.analysis.TokenFilterFactory;
24+
import org.apache.lucene.analysis.TokenStream;
25+
import org.apache.lucene.analysis.util.StringMockResourceLoader;
26+
import org.apache.lucene.util.Version;
27+
28+
/**
29+
* This test just ensures the factory works
30+
*/
31+
public class TestPatternTypingFilterFactory extends BaseTokenStreamFactoryTestCase {
32+
33+
public void testFactory() throws Exception {
34+
Token tokenA1 = new Token("One", 0, 2);
35+
Token tokenA3 = new Token("forty-two", 11, 13);
36+
Token tokenB1 = new Token("4-2", 15, 19);
37+
38+
TokenStream ts = new CannedTokenStream(tokenA1, tokenA3, tokenB1);
39+
40+
TokenFilterFactory tokenFilterFactory = tokenFilterFactory("patternTyping", Version.LATEST, new StringMockResourceLoader(
41+
"6 \\b(\\d+)-(\\d+) ::: $1_hnum_$2\n" +
42+
"2 \\b(\\w+)-(\\w+) ::: $1_hword_$2"
43+
), "patternFile", "patterns.txt");
44+
45+
ts = tokenFilterFactory.create(ts);
46+
assertTokenStreamContents(ts, new String[]{
47+
"One", "forty-two", "4-2"}, null, null,
48+
new String[]{"word", "forty_hword_two", "4_hnum_2"},
49+
null, null, null, null, null, false, null,
50+
new int[]{0, 2, 6});
51+
}
52+
}

0 commit comments

Comments
 (0)