|
3 | 3 | import org.apache.lucene.analysis.TokenStream; |
4 | 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
5 | 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| 6 | +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| 7 | +import java.util.stream.Collectors; |
6 | 8 | import org.junit.Test; |
7 | 9 | import org.wltea.analyzer.cfg.Configuration; |
8 | 10 | import org.wltea.analyzer.TestUtils; |
| 11 | +import org.wltea.analyzer.core.Lexeme; |
9 | 12 |
|
10 | 13 | import java.util.ArrayList; |
11 | 14 | import java.util.Arrays; |
@@ -172,4 +175,122 @@ static String[] tokenize(Configuration configuration, String s) |
172 | 175 | } |
173 | 176 | return tokens.toArray(new String[0]); |
174 | 177 | } |
| 178 | + |
| 179 | + /** |
| 180 | + * 用ik_max_word分词器分词,测试中文量词 |
| 181 | + */ |
| 182 | + @Test |
| 183 | + public void tokenize_CN_Quantifier_correctly() |
| 184 | + { |
| 185 | + Configuration cfg = TestUtils.createFakeConfigurationSub(false); |
| 186 | + String text = "2023年人才"; |
| 187 | + |
| 188 | + // 获取分词结果和类型 |
| 189 | + List<TokenInfo> tokenInfos = tokenizeWithType(cfg, text); |
| 190 | + |
| 191 | + // 打印所有分词结果和类型,便于调试 |
| 192 | + for (TokenInfo info : tokenInfos) { |
| 193 | + System.out.println("Token: " + info.getText() + ", Type: " + info.getType()); |
| 194 | + } |
| 195 | + |
| 196 | + // 验证分词结果包含预期的词 |
| 197 | + List<String> tokens = tokenInfos.stream().map(TokenInfo::getText).collect(Collectors.toList()); |
| 198 | + assert tokens.contains("2023"); |
| 199 | + assert tokens.contains("年"); |
| 200 | + assert tokens.contains("人才"); |
| 201 | + |
| 202 | + // 验证"人"不会被单独分割成COUNT类型 |
| 203 | + boolean hasPersonAsCount = tokenInfos.stream() |
| 204 | + .anyMatch(info -> "人".equals(info.getText()) && info.getType() == Lexeme.TYPE_COUNT); |
| 205 | + assert !hasPersonAsCount : "'人'不应该被分割为COUNT类型"; |
| 206 | + |
| 207 | + // 验证"年"是量词类型 |
| 208 | + boolean hasYearAsCount = tokenInfos.stream() |
| 209 | + .anyMatch(info -> "年".equals(info.getText()) && info.getType() == Lexeme.TYPE_COUNT); |
| 210 | + assert hasYearAsCount : "'年'应该是COUNT类型"; |
| 211 | + } |
| 212 | + |
| 213 | + |
| 214 | +/** |
| 215 | + * 分词结果信息类,包含词文本和类型 |
| 216 | + */ |
| 217 | + static class TokenInfo { |
| 218 | + private String text; |
| 219 | + private int type; |
| 220 | + |
| 221 | + public TokenInfo(String text, int type) { |
| 222 | + this.text = text; |
| 223 | + this.type = type; |
| 224 | + } |
| 225 | + |
| 226 | + public String getText() { |
| 227 | + return text; |
| 228 | + } |
| 229 | + |
| 230 | + public int getType() { |
| 231 | + return type; |
| 232 | + } |
| 233 | + } |
| 234 | + |
| 235 | + /** |
| 236 | + * 获取分词结果及其类型信息 |
| 237 | + */ |
| 238 | + static List<TokenInfo> tokenizeWithType(Configuration configuration, String s) { |
| 239 | + ArrayList<TokenInfo> tokenInfos = new ArrayList<>(); |
| 240 | + try (IKAnalyzer ikAnalyzer = new IKAnalyzer(configuration)) { |
| 241 | + TokenStream tokenStream = ikAnalyzer.tokenStream("text", s); |
| 242 | + tokenStream.reset(); |
| 243 | + |
| 244 | + CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); |
| 245 | + OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); |
| 246 | + TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class); |
| 247 | + |
| 248 | + while(tokenStream.incrementToken()) { |
| 249 | + int len = offsetAttribute.endOffset() - offsetAttribute.startOffset(); |
| 250 | + char[] chars = new char[len]; |
| 251 | + System.arraycopy(charTermAttribute.buffer(), 0, chars, 0, len); |
| 252 | + String text = new String(chars); |
| 253 | + |
| 254 | + // 获取类型信息并映射回对应的数字常量 |
| 255 | + String typeStr = typeAttribute.type(); |
| 256 | + int type = mapTypeStringToInt(typeStr); |
| 257 | + |
| 258 | + tokenInfos.add(new TokenInfo(text, type)); |
| 259 | + } |
| 260 | + } catch (Exception ex) { |
| 261 | + throw new RuntimeException(ex); |
| 262 | + } |
| 263 | + return tokenInfos; |
| 264 | + } |
| 265 | + |
| 266 | + /** |
| 267 | + * 将类型字符串映射为对应的数字常量 |
| 268 | + * |
| 269 | + * @param typeStr 类型字符串 |
| 270 | + * @return 对应的数字常量 |
| 271 | + */ |
| 272 | + private static int mapTypeStringToInt(String typeStr) { |
| 273 | + switch (typeStr) { |
| 274 | + case "ENGLISH": |
| 275 | + return Lexeme.TYPE_ENGLISH; |
| 276 | + case "ARABIC": |
| 277 | + return Lexeme.TYPE_ARABIC; |
| 278 | + case "LETTER": |
| 279 | + return Lexeme.TYPE_LETTER; |
| 280 | + case "CN_WORD": |
| 281 | + return Lexeme.TYPE_CNWORD; |
| 282 | + case "CN_CHAR": |
| 283 | + return Lexeme.TYPE_CNCHAR; |
| 284 | + case "OTHER_CJK": |
| 285 | + return Lexeme.TYPE_OTHER_CJK; |
| 286 | + case "COUNT": |
| 287 | + return Lexeme.TYPE_COUNT; |
| 288 | + case "TYPE_CNUM": |
| 289 | + return Lexeme.TYPE_CNUM; |
| 290 | + case "TYPE_CQUAN": |
| 291 | + return Lexeme.TYPE_CQUAN; |
| 292 | + default: |
| 293 | + return Lexeme.TYPE_UNKNOWN; |
| 294 | + } |
| 295 | + } |
175 | 296 | } |
0 commit comments