Skip to content

Commit f8b9e07

Browse files
authored
fix: fix CN_Quantifier problem in issue #1108 and add test code (#1109)
1 parent c4a00e7 commit f8b9e07

File tree

2 files changed

+147
-14
lines changed

2 files changed

+147
-14
lines changed

core/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -168,27 +168,39 @@ private void processCount(AnalyzeContext context){
168168
this.countHits.remove(hit);
169169
}
170170
}
171-
}
171+
}
172+
173+
// 检查是否应该进行**新的**字量词匹配
174+
// 只有在前面有数词的情况下才进行单字量词匹配
175+
boolean shouldMatchSingleChar = false;
176+
if(!context.getOrgLexemes().isEmpty()){
177+
Lexeme l = context.getOrgLexemes().peekLast();
178+
if((Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType())
179+
&& (l.getBegin() + l.getLength() == context.getCursor())){
180+
shouldMatchSingleChar = true;
181+
}
182+
}
183+
if(shouldMatchSingleChar || !this.countHits.isEmpty()){
172184

173-
//*********************************
174-
//对当前指针位置的字符进行单字匹配
175-
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
176-
if(singleCharHit.isMatch()){//首字成量词词
177-
//输出当前的词
178-
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
179-
context.addLexeme(newLexeme);
185+
//*********************************
186+
//对当前指针位置的字符进行单字匹配
187+
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
188+
if(singleCharHit.isMatch()){//首字成量词词
189+
//输出当前的词
190+
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
191+
context.addLexeme(newLexeme);
180192

181-
//同时也是词前缀
182-
if(singleCharHit.isPrefix()){
193+
//同时也是词前缀
194+
if(singleCharHit.isPrefix()){
195+
//前缀匹配则放入hit列表
196+
this.countHits.add(singleCharHit);
197+
}
198+
}else if(singleCharHit.isPrefix()){//首字为量词前缀
183199
//前缀匹配则放入hit列表
184200
this.countHits.add(singleCharHit);
185201
}
186-
}else if(singleCharHit.isPrefix()){//首字为量词前缀
187-
//前缀匹配则放入hit列表
188-
this.countHits.add(singleCharHit);
189202
}
190203

191-
192204
}else{
193205
//输入的不是中文字符
194206
//清空未成形的量词

core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
import org.apache.lucene.analysis.TokenStream;
44
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
55
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
6+
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
7+
import java.util.stream.Collectors;
68
import org.junit.Test;
79
import org.wltea.analyzer.cfg.Configuration;
810
import org.wltea.analyzer.TestUtils;
11+
import org.wltea.analyzer.core.Lexeme;
912

1013
import java.util.ArrayList;
1114
import java.util.Arrays;
@@ -172,4 +175,122 @@ static String[] tokenize(Configuration configuration, String s)
172175
}
173176
return tokens.toArray(new String[0]);
174177
}
178+
179+
/**
180+
* 用ik_max_word分词器分词,测试中文量词
181+
*/
182+
@Test
183+
public void tokenize_CN_Quantifier_correctly()
184+
{
185+
Configuration cfg = TestUtils.createFakeConfigurationSub(false);
186+
String text = "2023年人才";
187+
188+
// 获取分词结果和类型
189+
List<TokenInfo> tokenInfos = tokenizeWithType(cfg, text);
190+
191+
// 打印所有分词结果和类型,便于调试
192+
for (TokenInfo info : tokenInfos) {
193+
System.out.println("Token: " + info.getText() + ", Type: " + info.getType());
194+
}
195+
196+
// 验证分词结果包含预期的词
197+
List<String> tokens = tokenInfos.stream().map(TokenInfo::getText).collect(Collectors.toList());
198+
assert tokens.contains("2023");
199+
assert tokens.contains("年");
200+
assert tokens.contains("人才");
201+
202+
// 验证"人"不会被单独分割成COUNT类型
203+
boolean hasPersonAsCount = tokenInfos.stream()
204+
.anyMatch(info -> "人".equals(info.getText()) && info.getType() == Lexeme.TYPE_COUNT);
205+
assert !hasPersonAsCount : "'人'不应该被分割为COUNT类型";
206+
207+
// 验证"年"是量词类型
208+
boolean hasYearAsCount = tokenInfos.stream()
209+
.anyMatch(info -> "年".equals(info.getText()) && info.getType() == Lexeme.TYPE_COUNT);
210+
assert hasYearAsCount : "'年'应该是COUNT类型";
211+
}
212+
213+
214+
/**
215+
* 分词结果信息类,包含词文本和类型
216+
*/
217+
static class TokenInfo {
218+
private String text;
219+
private int type;
220+
221+
public TokenInfo(String text, int type) {
222+
this.text = text;
223+
this.type = type;
224+
}
225+
226+
public String getText() {
227+
return text;
228+
}
229+
230+
public int getType() {
231+
return type;
232+
}
233+
}
234+
235+
/**
236+
* 获取分词结果及其类型信息
237+
*/
238+
static List<TokenInfo> tokenizeWithType(Configuration configuration, String s) {
239+
ArrayList<TokenInfo> tokenInfos = new ArrayList<>();
240+
try (IKAnalyzer ikAnalyzer = new IKAnalyzer(configuration)) {
241+
TokenStream tokenStream = ikAnalyzer.tokenStream("text", s);
242+
tokenStream.reset();
243+
244+
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
245+
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
246+
TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);
247+
248+
while(tokenStream.incrementToken()) {
249+
int len = offsetAttribute.endOffset() - offsetAttribute.startOffset();
250+
char[] chars = new char[len];
251+
System.arraycopy(charTermAttribute.buffer(), 0, chars, 0, len);
252+
String text = new String(chars);
253+
254+
// 获取类型信息并映射回对应的数字常量
255+
String typeStr = typeAttribute.type();
256+
int type = mapTypeStringToInt(typeStr);
257+
258+
tokenInfos.add(new TokenInfo(text, type));
259+
}
260+
} catch (Exception ex) {
261+
throw new RuntimeException(ex);
262+
}
263+
return tokenInfos;
264+
}
265+
266+
/**
267+
* 将类型字符串映射为对应的数字常量
268+
*
269+
* @param typeStr 类型字符串
270+
* @return 对应的数字常量
271+
*/
272+
private static int mapTypeStringToInt(String typeStr) {
273+
switch (typeStr) {
274+
case "ENGLISH":
275+
return Lexeme.TYPE_ENGLISH;
276+
case "ARABIC":
277+
return Lexeme.TYPE_ARABIC;
278+
case "LETTER":
279+
return Lexeme.TYPE_LETTER;
280+
case "CN_WORD":
281+
return Lexeme.TYPE_CNWORD;
282+
case "CN_CHAR":
283+
return Lexeme.TYPE_CNCHAR;
284+
case "OTHER_CJK":
285+
return Lexeme.TYPE_OTHER_CJK;
286+
case "COUNT":
287+
return Lexeme.TYPE_COUNT;
288+
case "TYPE_CNUM":
289+
return Lexeme.TYPE_CNUM;
290+
case "TYPE_CQUAN":
291+
return Lexeme.TYPE_CQUAN;
292+
default:
293+
return Lexeme.TYPE_UNKNOWN;
294+
}
295+
}
175296
}

0 commit comments

Comments
 (0)