Skip to content

Commit 8b04070

Browse files
author
gaozhicheng
committed
更新Lucene版本为8.3.1;
1 parent 4dd4a86 commit 8b04070

26 files changed

+413
-406
lines changed

README.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,23 +22,24 @@ ik-analyzer for solr 7.x-8.x
2222
| word | 64.2万 | 2014年 |
2323
| jieba | 58.4万 | 2012年 |
2424
| jcesg | 16.6万 | 2018年 |
25-
| sougou词库 | 115.2万 | 2019年 |
25+
| sougou词库 | 115.2万 | 2020年 |
2626
#### 将以上词库进行整理后约187.1万条词汇;
2727
#### 添加动态加载词典表功能,在不需要重启solr服务的情况下加载新增的词典。
28+
> <small>关闭默认主词典请在`IKAnalyzer.cfg.xml`配置文件中设置`use_main_dict`为`false`。</small>
2829
* IKAnalyzer的原作者为林良益<linliangyi2007@gmail.com>,项目网站为<http://code.google.com/p/ik-analyzer>
2930
* 该项目动态加载功能根据博主[@星火燎原智勇](http://www.cnblogs.com/liang1101/articles/6395016.html)的博客进行修改,其GitHub地址为[@liang68](https://github.com/liang68)
3031

3132

3233
## 使用说明
33-
* jar包下载地址:[![GitHub version](https://img.shields.io/badge/version-8.3.0-519dd9.svg)](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.3.0/ik-analyzer-8.3.0.jar)
34+
* jar包下载地址:[![GitHub version](https://img.shields.io/badge/version-8.3.1-519dd9.svg)](https://search.maven.org/remotecontent?filepath=com/github/magese/ik-analyzer/8.3.1/ik-analyzer-8.3.1.jar)
3435
* 历史版本:[![GitHub version](https://img.shields.io/maven-central/v/com.github.magese/ik-analyzer.svg?style=flat-square)](https://search.maven.org/search?q=g:com.github.magese%20AND%20a:ik-analyzer&core=gav)
3536

3637
```console
3738
<!-- Maven仓库地址 -->
3839
<dependency>
3940
<groupId>com.github.magese</groupId>
4041
<artifactId>ik-analyzer</artifactId>
41-
<version>8.3.0</version>
42+
<version>8.3.1</version>
4243
</dependency>
4344
```
4445

@@ -79,7 +80,7 @@ ik-analyzer for solr 7.x-8.x
7980
5. `IKAnalyzer.cfg.xml`配置文件说明:
8081

8182
| 名称 | 类型 | 描述 | 默认 |
82-
| :------: | :------: | :------: | :------: |
83+
| ------ | ------ | ------ | ------ |
8384
| use_main_dict | boolean | 是否使用默认主词典 | true |
8485
| ext_dict | String | 扩展词典文件名称,多个用分号隔开 | ext.dic; |
8586
| ext_stopwords | String | 停用词典文件名称,多个用分号隔开 | stopword.dic; |
@@ -100,6 +101,9 @@ ik-analyzer for solr 7.x-8.x
100101

101102

102103
## 更新说明
104+
- `2020-12-30:`
105+
- 升级lucene版本为`8.3.1`
106+
- 更新词库
103107
- `2019-11-12:`
104108
- 升级lucene版本为`8.3.0`
105109
- `IKAnalyzer.cfg.xml`增加配置项`use_main_dict`,用于配置是否启用默认主词典

pom.xml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
<groupId>com.github.magese</groupId>
66
<artifactId>ik-analyzer</artifactId>
7-
<version>8.3.0</version>
7+
<version>8.3.1</version>
88
<packaging>jar</packaging>
99

1010
<name>ik-analyzer-solr</name>
@@ -13,7 +13,7 @@
1313

1414
<properties>
1515
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16-
<lucene.version>8.3.0</lucene.version>
16+
<lucene.version>8.3.1</lucene.version>
1717
<javac.src.version>1.8</javac.src.version>
1818
<javac.target.version>1.8</javac.target.version>
1919
<maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
@@ -152,4 +152,3 @@
152152
</profile>
153153
</profiles>
154154
</project>
155-

src/main/java/org/wltea/analyzer/cfg/Configuration.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* IK 中文分词 版本 8.3.0
3-
* IK Analyzer release 8.3.0
2+
* IK 中文分词 版本 8.3.1
3+
* IK Analyzer release 8.3.1
44
*
55
* Licensed to the Apache Software Foundation (ASF) under one or more
66
* contributor license agreements. See the NOTICE file distributed with
@@ -21,8 +21,8 @@
2121
* 版权声明 2012,乌龙茶工作室
2222
* provided by Linliangyi and copyright 2012 by Oolong studio
2323
*
24-
* 8.3.0版本 由 Magese (magese@live.cn) 更新
25-
* release 8.3.0 update by Magese(magese@live.cn)
24+
* 8.3.1版本 由 Magese (magese@live.cn) 更新
25+
* release 8.3.1 update by Magese(magese@live.cn)
2626
*
2727
*/
2828
package org.wltea.analyzer.cfg;

src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* IK 中文分词 版本 8.3.0
3-
* IK Analyzer release 8.3.0
2+
* IK 中文分词 版本 8.3.1
3+
* IK Analyzer release 8.3.1
44
*
55
* Licensed to the Apache Software Foundation (ASF) under one or more
66
* contributor license agreements. See the NOTICE file distributed with
@@ -21,8 +21,8 @@
2121
* 版权声明 2012,乌龙茶工作室
2222
* provided by Linliangyi and copyright 2012 by Oolong studio
2323
*
24-
* 8.3.0版本 由 Magese (magese@live.cn) 更新
25-
* release 8.3.0 update by Magese(magese@live.cn)
24+
* 8.3.1版本 由 Magese (magese@live.cn) 更新
25+
* release 8.3.1 update by Magese(magese@live.cn)
2626
*
2727
*/
2828
package org.wltea.analyzer.cfg;

src/main/java/org/wltea/analyzer/core/AnalyzeContext.java

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* IK 中文分词 版本 8.3.0
3-
* IK Analyzer release 8.3.0
2+
* IK 中文分词 版本 8.3.1
3+
* IK Analyzer release 8.3.1
44
*
55
* Licensed to the Apache Software Foundation (ASF) under one or more
66
* contributor license agreements. See the NOTICE file distributed with
@@ -21,23 +21,19 @@
2121
* 版权声明 2012,乌龙茶工作室
2222
* provided by Linliangyi and copyright 2012 by Oolong studio
2323
*
24-
* 8.3.0版本 由 Magese (magese@live.cn) 更新
25-
* release 8.3.0 update by Magese(magese@live.cn)
24+
* 8.3.1版本 由 Magese (magese@live.cn) 更新
25+
* release 8.3.1 update by Magese(magese@live.cn)
2626
*
2727
*/
2828
package org.wltea.analyzer.core;
2929

30-
import java.io.IOException;
31-
import java.io.Reader;
32-
import java.util.HashMap;
33-
import java.util.HashSet;
34-
import java.util.LinkedList;
35-
import java.util.Map;
36-
import java.util.Set;
37-
3830
import org.wltea.analyzer.cfg.Configuration;
3931
import org.wltea.analyzer.dic.Dictionary;
4032

33+
import java.io.IOException;
34+
import java.io.Reader;
35+
import java.util.*;
36+
4137
/**
4238
* 分词器上下文状态
4339
*/
@@ -66,17 +62,17 @@ class AnalyzeContext {
6662

6763
//子分词器锁
6864
//该集合非空,说明有子分词器在占用segmentBuff
69-
private Set<String> buffLocker;
65+
private final Set<String> buffLocker;
7066

7167
//原始分词结果集合,未经歧义处理
7268
private QuickSortSet orgLexemes;
7369
//LexemePath位置索引表
74-
private Map<Integer, LexemePath> pathMap;
70+
private final Map<Integer, LexemePath> pathMap;
7571
//最终分词结果集
76-
private LinkedList<Lexeme> results;
72+
private final LinkedList<Lexeme> results;
7773

7874
//分词器配置项
79-
private Configuration cfg;
75+
private final Configuration cfg;
8076

8177
AnalyzeContext(Configuration cfg) {
8278
this.cfg = cfg;
@@ -254,7 +250,7 @@ QuickSortSet getOrgLexemes() {
254250
*/
255251
void outputToResult() {
256252
int index = 0;
257-
for (; index <= this.cursor; ) {
253+
while (index <= this.cursor) {
258254
//跳过非CJK字符
259255
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
260256
index++;
@@ -353,12 +349,14 @@ private void compound(Lexeme result) {
353349
if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
354350
Lexeme nextLexeme = this.results.peekFirst();
355351
boolean appendOk = false;
356-
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
357-
//合并英文数词+中文数词
358-
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
359-
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
360-
//合并英文数词+中文量词
361-
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
352+
if (nextLexeme != null) {
353+
if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
354+
//合并英文数词+中文数词
355+
appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
356+
} else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
357+
//合并英文数词+中文量词
358+
appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
359+
}
362360
}
363361
if (appendOk) {
364362
//弹出

src/main/java/org/wltea/analyzer/core/CJKSegmenter.java

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* IK 中文分词 版本 8.3.0
3-
* IK Analyzer release 8.3.0
2+
* IK 中文分词 版本 8.3.1
3+
* IK Analyzer release 8.3.1
44
*
55
* Licensed to the Apache Software Foundation (ASF) under one or more
66
* contributor license agreements. See the NOTICE file distributed with
@@ -21,8 +21,8 @@
2121
* 版权声明 2012,乌龙茶工作室
2222
* provided by Linliangyi and copyright 2012 by Oolong studio
2323
*
24-
* 8.3.0版本 由 Magese (magese@live.cn) 更新
25-
* release 8.3.0 update by Magese(magese@live.cn)
24+
* 8.3.1版本 由 Magese (magese@live.cn) 更新
25+
* release 8.3.1 update by Magese(magese@live.cn)
2626
*
2727
*/
2828
package org.wltea.analyzer.core;
@@ -38,13 +38,13 @@
3838
* 中文-日韩文子分词器
3939
*/
4040
class CJKSegmenter implements ISegmenter {
41-
41+
4242
//子分词器标签
4343
private static final String SEGMENTER_NAME = "CJK_SEGMENTER";
4444
//待处理的分词hit队列
4545
private List<Hit> tmpHits;
46-
47-
46+
47+
4848
CJKSegmenter(){
4949
this.tmpHits = new LinkedList<>();
5050
}
@@ -54,7 +54,7 @@ class CJKSegmenter implements ISegmenter {
5454
*/
5555
public void analyze(AnalyzeContext context) {
5656
if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
57-
57+
5858
//优先处理tmpHits中的hit
5959
if(!this.tmpHits.isEmpty()){
6060
//处理词段队列
@@ -65,18 +65,18 @@ public void analyze(AnalyzeContext context) {
6565
//输出当前的词
6666
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
6767
context.addLexeme(newLexeme);
68-
68+
6969
if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
7070
this.tmpHits.remove(hit);
7171
}
72-
72+
7373
}else if(hit.isUnmatch()){
7474
//hit不是词,移除
7575
this.tmpHits.remove(hit);
76-
}
76+
}
7777
}
78-
}
79-
78+
}
79+
8080
//*********************************
8181
//再对当前指针位置的字符进行单字匹配
8282
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
@@ -94,24 +94,24 @@ public void analyze(AnalyzeContext context) {
9494
//前缀匹配则放入hit列表
9595
this.tmpHits.add(singleCharHit);
9696
}
97-
97+
9898

9999
}else{
100100
//遇到CHAR_USELESS字符
101101
//清空队列
102102
this.tmpHits.clear();
103103
}
104-
104+
105105
//判断缓冲区是否已经读完
106106
if(context.isBufferConsumed()){
107107
//清空队列
108108
this.tmpHits.clear();
109109
}
110-
110+
111111
//判断是否锁定缓冲区
112112
if(this.tmpHits.size() == 0){
113113
context.unlockBuffer(SEGMENTER_NAME);
114-
114+
115115
}else{
116116
context.lockBuffer(SEGMENTER_NAME);
117117
}

0 commit comments

Comments
 (0)