|
10 | 10 | import org.ansj.domain.NewWord; |
11 | 11 | import org.ansj.domain.TermNatures; |
12 | 12 | import org.ansj.recognition.AsianPersonRecognition; |
13 | | -import org.ansj.recognition.CompanyRecogntion; |
14 | 13 | import org.ansj.recognition.ForeignPersonRecognition; |
15 | 14 | import org.ansj.util.Graph; |
16 | 15 |
|
|
22 | 21 | */ |
23 | 22 | public class LearnTool { |
24 | 23 |
|
25 | | - /** |
26 | | - * 是否开启学习机 |
27 | | - */ |
28 | | - public boolean isCompany = true; |
29 | | - |
30 | | - public boolean isAsianName = true; |
31 | | - |
32 | | - public boolean isForeignName = true; |
33 | | - |
34 | | - /** |
35 | | - * 告诉大家你学习了多少个词了 |
36 | | - */ |
37 | | - public int count; |
38 | | - |
39 | | - /** |
40 | | - * 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做. |
41 | | - */ |
42 | | - private final SmartForest<NewWord> sf = new SmartForest<NewWord>(); |
43 | | - |
44 | | - /** |
45 | | - * 公司名称学习. |
46 | | - * |
47 | | - * @param graph |
48 | | - */ |
49 | | - public void learn(Graph graph) { |
50 | | - |
51 | | - // 亚洲人名识别 |
52 | | - if (isAsianName) { |
53 | | - findAsianPerson(graph); |
54 | | - } |
55 | | - |
56 | | - // 外国人名识别 |
57 | | - if (isForeignName) { |
58 | | - findForeignPerson(graph); |
59 | | - } |
60 | | - |
61 | | - } |
62 | | - |
63 | | - private void findAsianPerson(Graph graph) { |
64 | | - List<NewWord> newWords = new AsianPersonRecognition(graph.terms).getNewWords(); |
65 | | - addListToTerm(newWords); |
66 | | - } |
67 | | - |
68 | | - private void findForeignPerson(Graph graph) { |
69 | | - List<NewWord> newWords = new ForeignPersonRecognition(graph.terms).getNewWords(); |
70 | | - addListToTerm(newWords); |
71 | | - } |
72 | | - |
73 | | - /** |
74 | | - * 公司名称查找 |
75 | | - * |
76 | | - * @param graph |
77 | | - */ |
78 | | - private void findCompany(Graph graph) { |
79 | | - List<NewWord> newWords = new CompanyRecogntion(graph.terms).getNewWords(); |
80 | | - addListToTerm(newWords); |
81 | | - } |
82 | | - |
83 | | - // 批量将新词加入到词典中 |
84 | | - private void addListToTerm(List<NewWord> newWords) { |
85 | | - if (newWords.size() == 0) |
86 | | - return; |
87 | | - |
88 | | - for (NewWord newWord : newWords) { |
89 | | - newWord.setScore(-1); |
90 | | - addTerm(newWord); |
91 | | - } |
92 | | - } |
93 | | - |
94 | | - |
95 | | - /** |
96 | | - * 增加一个新词到树中 |
97 | | - * |
98 | | - * @param newWord |
99 | | - */ |
100 | | - public void addTerm(NewWord newWord) { |
101 | | - NewWord temp = null; |
102 | | - SmartForest<NewWord> smartForest = null; |
103 | | - if ((smartForest = sf.getBranch(newWord.getName())) != null |
104 | | - && smartForest.getParam() != null) { |
105 | | - temp = smartForest.getParam(); |
106 | | - temp.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq()); |
107 | | - } else { |
108 | | - count++; |
109 | | - // 设置名字为空,节省内存空间 |
110 | | - synchronized (sf) { |
111 | | - sf.add(newWord.getName(), newWord); |
112 | | - } |
113 | | - } |
114 | | - } |
115 | | - |
116 | | - public SmartForest<NewWord> getForest() { |
117 | | - return this.sf; |
118 | | - } |
119 | | - |
120 | | - /** |
121 | | - * 返回学习到的新词. |
122 | | - * |
123 | | - * @param num |
124 | | - * 返回数目.0为全部返回 |
125 | | - * @return |
126 | | - */ |
127 | | - public List<Entry<String, Double>> getTopTree(int num) { |
128 | | - return getTopTree(num, null); |
129 | | - } |
130 | | - |
131 | | - public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) { |
132 | | - if (sf.branches == null) { |
133 | | - return null; |
134 | | - } |
135 | | - HashMap<String, Double> hm = new HashMap<String, Double>(); |
136 | | - for (int i = 0; i < sf.branches.length; i++) { |
137 | | - valueResult(sf.branches[i], hm, nature); |
138 | | - } |
139 | | - List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1); |
140 | | - if (num == 0) { |
141 | | - return sortMapByValue; |
142 | | - } else { |
143 | | - num = Math.min(num, sortMapByValue.size()); |
144 | | - return sortMapByValue.subList(0, num); |
145 | | - } |
146 | | - } |
147 | | - |
148 | | - private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, |
149 | | - TermNatures nature) { |
150 | | - // TODO Auto-generated method stub |
151 | | - if (smartForest == null || smartForest.branches==null) { |
152 | | - return ; |
153 | | - } |
154 | | - for (int i = 0; i < smartForest.branches.length; i++) { |
155 | | - NewWord param = smartForest.branches[i].getParam(); |
156 | | - if (smartForest.branches[i].getStatus() == 3) { |
157 | | - if (nature == null || param.getNature().equals(nature)) { |
158 | | - hm.put(param.getName(), param.getScore()); |
159 | | - } |
160 | | - } else if (smartForest.branches[i].getStatus() == 2) { |
161 | | - if (nature == null || param.getNature().equals(nature)) { |
162 | | - hm.put(param.getName(), param.getScore()); |
163 | | - } |
164 | | - valueResult(smartForest.branches[i], hm, nature); |
165 | | - } else { |
166 | | - valueResult(smartForest.branches[i], hm, nature); |
167 | | - } |
168 | | - } |
169 | | - } |
| 24 | + /** |
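| 25 | + * Recognizer switches read by learn(): Asian person names (isAsianName) and foreign person names (isForeignName). |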
| 26 | + */ |
| 27 | + public boolean isAsianName = true; |
| 28 | + |
| 29 | + public boolean isForeignName = true; |
| 30 | + |
| 31 | + /** |
| 32 | + * 告诉大家你学习了多少个词了 |
| 33 | + */ |
| 34 | + public int count; |
| 35 | + |
| 36 | + /** |
| 37 | + * 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做. |
| 38 | + */ |
| 39 | + private final SmartForest<NewWord> sf = new SmartForest<NewWord>(); |
| 40 | + |
| 41 | + /** |
| 42 | + * Learns new words from the graph: runs Asian and foreign person-name recognition, each gated by its flag. |
| 43 | + * |
| 44 | + * @param graph |
| 45 | + */ |
| 46 | + public void learn(Graph graph) { |
| 47 | + |
| 48 | + // 亚洲人名识别 |
| 49 | + if (isAsianName) { |
| 50 | + findAsianPerson(graph); |
| 51 | + } |
| 52 | + |
| 53 | + // 外国人名识别 |
| 54 | + if (isForeignName) { |
| 55 | + findForeignPerson(graph); |
| 56 | + } |
| 57 | + |
| 58 | + } |
| 59 | + |
| 60 | + private void findAsianPerson(Graph graph) { |
| 61 | + List<NewWord> newWords = new AsianPersonRecognition(graph.terms).getNewWords(); |
| 62 | + addListToTerm(newWords); |
| 63 | + } |
| 64 | + |
| 65 | + private void findForeignPerson(Graph graph) { |
| 66 | + List<NewWord> newWords = new ForeignPersonRecognition(graph.terms).getNewWords(); |
| 67 | + addListToTerm(newWords); |
| 68 | + } |
| 69 | + |
| 70 | + // 批量将新词加入到词典中 |
| 71 | + private void addListToTerm(List<NewWord> newWords) { |
| 72 | + if (newWords.size() == 0) |
| 73 | + return; |
| 74 | + |
| 75 | + for (NewWord newWord : newWords) { |
| 76 | + newWord.setScore(-1); |
| 77 | + addTerm(newWord); |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + /** |
| 82 | + * 增加一个新词到树中 |
| 83 | + * |
| 84 | + * @param newWord |
| 85 | + */ |
| 86 | + public void addTerm(NewWord newWord) { |
| 87 | + NewWord temp = null; |
| 88 | + SmartForest<NewWord> smartForest = null; |
| 89 | + if ((smartForest = sf.getBranch(newWord.getName())) != null && smartForest.getParam() != null) { |
| 90 | + temp = smartForest.getParam(); |
| 91 | + temp.update(newWord.getScore(), newWord.getNature(), newWord.getAllFreq()); |
| 92 | + } else { |
| 93 | + count++; |
| 94 | + // Word not stored yet: insert it into the forest while holding the sf lock (the name itself is not cleared here) |
| 95 | + synchronized (sf) { |
| 96 | + sf.add(newWord.getName(), newWord); |
| 97 | + } |
| 98 | + } |
| 99 | + } |
| 100 | + |
| 101 | + public SmartForest<NewWord> getForest() { |
| 102 | + return this.sf; |
| 103 | + } |
| 104 | + |
| 105 | + /** |
| 106 | + * 返回学习到的新词. |
| 107 | + * |
| 108 | + * @param num |
| 109 | + * 返回数目.0为全部返回 |
| 110 | + * @return |
| 111 | + */ |
| 112 | + public List<Entry<String, Double>> getTopTree(int num) { |
| 113 | + return getTopTree(num, null); |
| 114 | + } |
| 115 | + |
| 116 | + public List<Entry<String, Double>> getTopTree(int num, TermNatures nature) { |
| 117 | + if (sf.branches == null) { |
| 118 | + return null; |
| 119 | + } |
| 120 | + HashMap<String, Double> hm = new HashMap<String, Double>(); |
| 121 | + for (int i = 0; i < sf.branches.length; i++) { |
| 122 | + valueResult(sf.branches[i], hm, nature); |
| 123 | + } |
| 124 | + List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1); |
| 125 | + if (num == 0) { |
| 126 | + return sortMapByValue; |
| 127 | + } else { |
| 128 | + num = Math.min(num, sortMapByValue.size()); |
| 129 | + return sortMapByValue.subList(0, num); |
| 130 | + } |
| 131 | + } |
| 132 | + |
| 133 | + private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, TermNatures nature) { |
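| 134 | + // Recursively walks smartForest's branches; for branches with status 2 or 3 puts name -> score into hm (only when nature is null or equals the word's nature) |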
| 135 | + if (smartForest == null || smartForest.branches == null) { |
| 136 | + return; |
| 137 | + } |
| 138 | + for (int i = 0; i < smartForest.branches.length; i++) { |
| 139 | + NewWord param = smartForest.branches[i].getParam(); |
| 140 | + if (smartForest.branches[i].getStatus() == 3) { |
| 141 | + if (nature == null || param.getNature().equals(nature)) { |
| 142 | + hm.put(param.getName(), param.getScore()); |
| 143 | + } |
| 144 | + } else if (smartForest.branches[i].getStatus() == 2) { |
| 145 | + if (nature == null || param.getNature().equals(nature)) { |
| 146 | + hm.put(param.getName(), param.getScore()); |
| 147 | + } |
| 148 | + valueResult(smartForest.branches[i], hm, nature); |
| 149 | + } else { |
| 150 | + valueResult(smartForest.branches[i], hm, nature); |
| 151 | + } |
| 152 | + } |
| 153 | + } |
170 | 154 | } |
0 commit comments