12 | 12 |
13 | 13 | import jieba |
14 | 14 | import jieba.posseg |
15 | | -from jieba import analyse |
16 | | - |
17 | | -from common.util.split_model import group_by |
18 | 15 |
19 | 16 | jieba_word_list_cache = [chr(item) for item in range(38, 84)] |
20 | 17 |
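Note: the retained `jieba_word_list_cache` comprehension evaluates to the 46 ASCII characters from chr(38) ('&') through chr(83) ('S'), presumably single-character placeholders for unsegmented terms. A quick sketch of its contents:

```python
# Sketch only: reproduces the retained comprehension to show what it holds.
cache = [chr(item) for item in range(38, 84)]
assert len(cache) == 46
print(cache[:5])   # ['&', "'", '(', ')', '*']
print(cache[-1])   # 'S'
```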
@@ -80,37 +77,12 @@ def get_key_by_word_dict(key, word_dict): |
80 | 77 |
81 | 78 |
82 | 79 | def to_ts_vector(text: str): |
83 | | - # Get the terms that must not be segmented |
84 | | - word_list = get_word_list(text) |
85 | | - # Get the keyword relationships |
86 | | - word_dict = to_word_dict(word_list, text) |
87 | | - # Replace them in the original string |
88 | | - text = replace_word(word_dict, text) |
89 | 80 | # Segment the text |
90 | | - filter_word = jieba.analyse.extract_tags(text, topK=100) |
91 | | - result = jieba.lcut(text, HMM=True, use_paddle=True) |
92 | | - # Filter out punctuation |
93 | | - result = [item for item in result if filter_word.__contains__(item) and len(item) < 10] |
94 | | - result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in |
95 | | - range(len(result))] |
96 | | - result_group = group_by(result_, lambda r: r['word']) |
97 | | - return " ".join( |
98 | | - [f"{key.lower()}:{','.join([str(item['index'] + 1) for item in result_group[key]][:20])}" for key in |
99 | | - result_group if |
100 | | - not remove_chars.__contains__(key) and len(key.strip()) >= 0]) |
| 81 | + result = jieba.lcut(text) |
| 82 | + return " ".join(result) |
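With the placeholder pipeline removed, `to_ts_vector` reduces to a plain segment-and-join. A minimal sketch of the new behavior, assuming jieba's stock dictionary:

```python
import jieba

def to_ts_vector(text: str):
    # Precise-mode segmentation; tokens joined with single spaces.
    result = jieba.lcut(text)
    return " ".join(result)

# With jieba's default dictionary (example from jieba's README):
# to_ts_vector("我来到北京清华大学") == "我 来到 北京 清华大学"
```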
101 | 83 |
102 | 84 |
103 | 85 | def to_query(text: str): |
104 | | - # Get the terms that must not be segmented |
105 | | - word_list = get_word_list(text) |
106 | | - # Get the keyword relationships |
107 | | - word_dict = to_word_dict(word_list, text) |
108 | | - # Replace them in the original string |
109 | | - text = replace_word(word_dict, text) |
110 | | - extract_tags = analyse.extract_tags(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v', 'eng')) |
111 | | - result = " ".join([get_key_by_word_dict(word, word_dict) for word, score in extract_tags if |
112 | | - not remove_chars.__contains__(word)]) |
113 | | - # Remove the custom words from jieba's dictionary |
114 | | - for word in word_list: |
115 | | - jieba.del_word(word) |
| 86 | + extract_tags = jieba.lcut(text) |
| 87 | + result = " ".join(extract_tags) |
116 | 88 | return result |
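The simplified `to_query` now does the same segment-and-join as `to_ts_vector`; the `extract_tags` variable name is a leftover from the removed `analyse.extract_tags` call. For the record, the removed path used jieba's documented TF-IDF extractor with a POS whitelist; a minimal sketch of that call:

```python
from jieba import analyse

# TF-IDF keyword extraction as the removed code invoked it: top 5
# keywords with weights, limited to place names (ns), nouns (n),
# verb-nouns (vn), verbs (v) and English tokens (eng).
tags = analyse.extract_tags(
    "我来到北京清华大学",
    topK=5,
    withWeight=True,
    allowPOS=('ns', 'n', 'vn', 'v', 'eng'),
)
for word, score in tags:
    print(word, score)
```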