Skip to content

Commit 2991f0b

Browse files
authored
perf: Optimize word segmentation retrieval (#2767)
1 parent 6fde8ec commit 2991f0b

File tree

2 files changed

+9
-34
lines changed

2 files changed

+9
-34
lines changed

apps/common/util/ts_vecto_util.py

Lines changed: 4 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@
1212

1313
import jieba
1414
import jieba.posseg
15-
from jieba import analyse
16-
17-
from common.util.split_model import group_by
1815

1916
jieba_word_list_cache = [chr(item) for item in range(38, 84)]
2017

@@ -80,37 +77,12 @@ def get_key_by_word_dict(key, word_dict):
8077

8178

8279
def to_ts_vector(text: str):
83-
# 获取不分词的数据
84-
word_list = get_word_list(text)
85-
# 获取关键词关系
86-
word_dict = to_word_dict(word_list, text)
87-
# 替换字符串
88-
text = replace_word(word_dict, text)
8980
# 分词
90-
filter_word = jieba.analyse.extract_tags(text, topK=100)
91-
result = jieba.lcut(text, HMM=True, use_paddle=True)
92-
# 过滤标点符号
93-
result = [item for item in result if filter_word.__contains__(item) and len(item) < 10]
94-
result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in
95-
range(len(result))]
96-
result_group = group_by(result_, lambda r: r['word'])
97-
return " ".join(
98-
[f"{key.lower()}:{','.join([str(item['index'] + 1) for item in result_group[key]][:20])}" for key in
99-
result_group if
100-
not remove_chars.__contains__(key) and len(key.strip()) >= 0])
81+
result = jieba.lcut(text)
82+
return " ".join(result)
10183

10284

10385
def to_query(text: str):
104-
# 获取不分词的数据
105-
word_list = get_word_list(text)
106-
# 获取关键词关系
107-
word_dict = to_word_dict(word_list, text)
108-
# 替换字符串
109-
text = replace_word(word_dict, text)
110-
extract_tags = analyse.extract_tags(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v', 'eng'))
111-
result = " ".join([get_key_by_word_dict(word, word_dict) for word, score in extract_tags if
112-
not remove_chars.__contains__(word)])
113-
# 删除词库
114-
for word in word_list:
115-
jieba.del_word(word)
86+
extract_tags = jieba.lcut(text)
87+
result = " ".join(extract_tags)
11688
return result

apps/embedding/vector/pg_vector.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
from abc import ABC, abstractmethod
1313
from typing import Dict, List
1414

15-
from django.db.models import QuerySet
15+
import jieba
16+
from django.contrib.postgres.search import SearchVector
17+
from django.db.models import QuerySet, Value
1618
from langchain_core.embeddings import Embeddings
1719

1820
from common.db.search import generate_sql_by_query_dict
@@ -68,7 +70,8 @@ def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_the_task_
6870
source_id=text_list[index].get('source_id'),
6971
source_type=text_list[index].get('source_type'),
7072
embedding=embeddings[index],
71-
search_vector=to_ts_vector(text_list[index]['text'])) for index in
73+
search_vector=SearchVector(Value(to_ts_vector(text_list[index]['text'])))) for
74+
index in
7275
range(0, len(texts))]
7376
if not is_the_task_interrupted():
7477
QuerySet(Embedding).bulk_create(embedding_list) if len(embedding_list) > 0 else None

0 commit comments

Comments (0)