fix: fix CN_Quantifier problem in issue #1108 and add test code (#1109)

kin122 · web-flow · commit f8b9e078da94 · 2025-05-20T10:15:58.000+08:00
diff --git a/core/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java b/core/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
@@ -168,27 +168,39 @@ private void processCount(AnalyzeContext context){
 						this.countHits.remove(hit);
 					}					
 				}
-			}				
+			}		
+			
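+			// Decide whether a new single-character quantifier match should be started here.
+			// It is attempted only when a Chinese/Arabic numeral ends right at the cursor, or when quantifier prefix hits are pending.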
+			boolean shouldMatchSingleChar = false;
+			if(!context.getOrgLexemes().isEmpty()){
+				Lexeme l = context.getOrgLexemes().peekLast();
+				if((Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType())
+					&& (l.getBegin() + l.getLength() == context.getCursor())){
+					shouldMatchSingleChar = true;
+				}
+			}
+			if(shouldMatchSingleChar || !this.countHits.isEmpty()){
 
-			//*********************************
-			//对当前指针位置的字符进行单字匹配
-			Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
-			if(singleCharHit.isMatch()){//首字成量词词
-				//输出当前的词
-				Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
-				context.addLexeme(newLexeme);
+				//*********************************
+				//对当前指针位置的字符进行单字匹配
+				Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+				if(singleCharHit.isMatch()){//首字成量词词
+					//输出当前的词
+					Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
+					context.addLexeme(newLexeme);
 
-				//同时也是词前缀
-				if(singleCharHit.isPrefix()){
+					//同时也是词前缀
+					if(singleCharHit.isPrefix()){
+						//前缀匹配则放入hit列表
+						this.countHits.add(singleCharHit);
+					}
+				}else if(singleCharHit.isPrefix()){//首字为量词前缀
 					//前缀匹配则放入hit列表
 					this.countHits.add(singleCharHit);
 				}
-			}else if(singleCharHit.isPrefix()){//首字为量词前缀
-				//前缀匹配则放入hit列表
-				this.countHits.add(singleCharHit);
 			}
 			
-			
 		}else{
 			//输入的不是中文字符
 			//清空未成形的量词
diff --git a/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java b/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java
@@ -3,9 +3,12 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import java.util.stream.Collectors;
 import org.junit.Test;
 import org.wltea.analyzer.cfg.Configuration;
 import org.wltea.analyzer.TestUtils;
+import org.wltea.analyzer.core.Lexeme;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -172,4 +175,122 @@ static String[] tokenize(Configuration configuration, String s)
         }
         return  tokens.toArray(new String[0]);
     }
+
+    /**
+     * Tokenizes with the ik_max_word analyzer and checks how Chinese quantifiers are typed.
+     */
+    @Test
+    public void tokenize_CN_Quantifier_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String text = "2023年人才";
+
+        // Tokenize once, keeping both the term text and its type code.
+        List<TokenInfo> tokenInfos = tokenizeWithType(cfg, text);
+
+        // Dump every token with its type so a failure is easy to diagnose.
+        for (TokenInfo info : tokenInfos) {
+            System.out.println("Token: " + info.getText() + ", Type: " + info.getType());
+        }
+
+        // Expected tokens must be present; JUnit asserts are used because the `assert` keyword is skipped without -ea.
+        List<String> tokens = tokenInfos.stream().map(TokenInfo::getText).collect(Collectors.toList());
+        org.junit.Assert.assertTrue(tokens.contains("2023"));
+        org.junit.Assert.assertTrue(tokens.contains("年"));
+        org.junit.Assert.assertTrue(tokens.contains("人才"));
+
+        // "人" must not be emitted on its own as a COUNT token.
+        boolean hasPersonAsCount = tokenInfos.stream()
+                .anyMatch(info -> "人".equals(info.getText()) && info.getType() == Lexeme.TYPE_COUNT);
+        org.junit.Assert.assertFalse("'人'不应该被分割为COUNT类型", hasPersonAsCount);
+
+        // "年" must be emitted as a COUNT token.
+        boolean hasYearAsCount = tokenInfos.stream()
+                .anyMatch(info -> "年".equals(info.getText()) && info.getType() == Lexeme.TYPE_COUNT);
+        org.junit.Assert.assertTrue("'年'应该是COUNT类型", hasYearAsCount);
+    }
+
+
+    /**
+     * Immutable holder for one token: its text and its integer type code.
+     */
+    static class TokenInfo {
+        private final String text;
+        private final int type;
+
+        public TokenInfo(String text, int type) {
+            this.text = text;
+            this.type = type;
+        }
+
+        public String getText() {
+            return text;
+        }
+
+        public int getType() {
+            return type;
+        }
+    }
+    
+    /**
+     * Tokenizes {@code s} and returns every term together with its integer type code.
+     */
+    static List<TokenInfo> tokenizeWithType(Configuration configuration, String s) {
+        List<TokenInfo> tokenInfos = new ArrayList<>();
+        // Analyzer and stream are both closed on every path, completing the
+        // TokenStream consumer workflow (reset, incrementToken, end, close).
+        try (IKAnalyzer ikAnalyzer = new IKAnalyzer(configuration);
+             TokenStream tokenStream = ikAnalyzer.tokenStream("text", s)) {
+            tokenStream.reset();
+
+            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
+            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);
+
+            while (tokenStream.incrementToken()) {
+                // Read the term itself rather than slicing the buffer by the offset
+                // span, which is only correct when both lengths happen to agree.
+                String text = charTermAttribute.toString();
+
+                // Map the type name reported by the stream back to its integer constant.
+                int type = mapTypeStringToInt(typeAttribute.type());
+                tokenInfos.add(new TokenInfo(text, type));
+            }
+            // Perform end-of-stream operations once the last token was consumed.
+            tokenStream.end();
+        } catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+        return tokenInfos;
+    }
+    
+    /**
+     * Translates a lexeme type name into the matching {@code Lexeme.TYPE_*} constant.
+     *
+     * @param typeStr type name to translate
+     * @return the matching constant, or {@code Lexeme.TYPE_UNKNOWN} when the name is not recognized
+     */
+    private static int mapTypeStringToInt(String typeStr) {
+        // Dereference typeStr first so that a null name fails fast with a NullPointerException.
+        if (typeStr.equals("ENGLISH")) {
+            return Lexeme.TYPE_ENGLISH;
+        } else if (typeStr.equals("ARABIC")) {
+            return Lexeme.TYPE_ARABIC;
+        } else if (typeStr.equals("LETTER")) {
+            return Lexeme.TYPE_LETTER;
+        } else if (typeStr.equals("CN_WORD")) {
+            return Lexeme.TYPE_CNWORD;
+        } else if (typeStr.equals("CN_CHAR")) {
+            return Lexeme.TYPE_CNCHAR;
+        } else if (typeStr.equals("OTHER_CJK")) {
+            return Lexeme.TYPE_OTHER_CJK;
+        } else if (typeStr.equals("COUNT")) {
+            return Lexeme.TYPE_COUNT;
+        } else if (typeStr.equals("TYPE_CNUM")) {
+            return Lexeme.TYPE_CNUM;
+        } else if (typeStr.equals("TYPE_CQUAN")) {
+            return Lexeme.TYPE_CQUAN;
+        } else {
+            return Lexeme.TYPE_UNKNOWN;
+        }
+    }
 }