Commit dbaafee

fix: knowledge base replace sync did not overwrite the local knowledge base
1 parent f19a4d9 commit dbaafee

File tree: 3 files changed, 51 insertions(+), 6 deletions(-)

apps/dataset/serializers/dataset_serializers.py (7 additions, 4 deletions)

@@ -38,7 +38,7 @@
 from dataset.serializers.common_serializers import list_paragraph, MetaSerializer, ProblemParagraphManage, \
     get_embedding_model_by_dataset_id, get_embedding_model_id_by_dataset_id
 from dataset.serializers.document_serializers import DocumentSerializers, DocumentInstanceSerializer
-from dataset.task import sync_web_dataset
+from dataset.task import sync_web_dataset, sync_replace_web_dataset
 from embedding.models import SearchMode
 from embedding.task import embedding_by_dataset, delete_embedding_by_dataset
 from setting.models import AuthOperate
@@ -602,15 +602,18 @@ def handler(child_link: ChildLink, response: Fork.Response):
             document_name = child_link.tag.text if child_link.tag is not None and len(
                 child_link.tag.text.strip()) > 0 else child_link.url
             paragraphs = get_split_model('web.md').parse(response.content)
-            first = QuerySet(Document).filter(meta__source_url=child_link.url, dataset=dataset).first()
+            print(child_link.url.strip())
+            first = QuerySet(Document).filter(meta__source_url=child_link.url.strip(),
+                                              dataset=dataset).first()
             if first is not None:
                 # If the document already exists, use document sync
                 DocumentSerializers.Sync(data={'document_id': first.id}).sync()
             else:
                 # Otherwise insert a new document
                 DocumentSerializers.Create(data={'dataset_id': dataset.id}).save(
                     {'name': document_name, 'paragraphs': paragraphs,
-                     'meta': {'source_url': child_link.url, 'selector': dataset.meta.get('selector')},
+                     'meta': {'source_url': child_link.url.strip(),
+                              'selector': dataset.meta.get('selector')},
                      'type': Type.web}, with_valid=True)
         except Exception as e:
             logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
@@ -624,7 +627,7 @@ def replace_sync(self, dataset):
         """
         url = dataset.meta.get('source_url')
         selector = dataset.meta.get('selector') if 'selector' in dataset.meta else None
-        sync_web_dataset.delay(str(dataset.id), url, selector)
+        sync_replace_web_dataset.delay(str(dataset.id), url, selector)
 
     def complete_sync(self, dataset):
         """

apps/dataset/task/sync.py (13 additions, 1 deletion)

@@ -14,7 +14,7 @@
 from celery_once import QueueOnce
 
 from common.util.fork import ForkManage, Fork
-from dataset.task.tools import get_save_handler, get_sync_web_document_handler
+from dataset.task.tools import get_save_handler, get_sync_web_document_handler, get_sync_handler
 
 from ops import celery_app
 
@@ -34,6 +34,18 @@ def sync_web_dataset(dataset_id: str, url: str, selector: str):
         max_kb_error.error(f'Error syncing web knowledge base {dataset_id}: {str(e)}{traceback.format_exc()}')
 
 
+@celery_app.task(base=QueueOnce, once={'keys': ['dataset_id']}, name='celery:sync_web_dataset')
+def sync_replace_web_dataset(dataset_id: str, url: str, selector: str):
+    try:
+        max_kb.info(f"Start ---> syncing web knowledge base: {dataset_id}")
+        ForkManage(url, selector.split(" ") if selector is not None else []).fork(2, set(),
+                                                                                  get_sync_handler(dataset_id
+                                                                                                   ))
+        max_kb.info(f"End ---> finished syncing web knowledge base: {dataset_id}")
+    except Exception as e:
+        max_kb_error.error(f'Error syncing web knowledge base {dataset_id}: {str(e)}{traceback.format_exc()}')
+
+
 @celery_app.task(name='celery:sync_web_document')
 def sync_web_document(dataset_id, source_url_list: List[str], selector: str):
     handler = get_sync_web_document_handler(dataset_id)
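The new task mirrors `sync_web_dataset` but installs the replace-aware handler from `tools.py`. Its `base=QueueOnce` with `once={'keys': ['dataset_id']}` (from the `celery_once` package) deduplicates on the dataset id alone, so a second sync for the same knowledge base is rejected with `AlreadyQueued` while one is pending, even if `url` or `selector` differ. A hedged sketch of that pattern, assuming a Redis broker (app name, broker URL, and task body are illustrative):

```python
# Sketch of celery_once deduplication keyed on a single argument.
# Assumes Redis is reachable at the given URL; names are illustrative.
from celery import Celery
from celery_once import QueueOnce

app = Celery('sketch', broker='redis://localhost:6379/0')
app.conf.ONCE = {
    'backend': 'celery_once.backends.Redis',
    'settings': {'url': 'redis://localhost:6379/0', 'default_timeout': 3600},
}

@app.task(base=QueueOnce, once={'keys': ['dataset_id']})
def sync_dataset(dataset_id: str, url: str, selector: str = None):
    ...  # crawl the site and upsert documents here

# Only dataset_id enters the dedup key:
# sync_dataset.delay('42', 'https://a')  -> queued
# sync_dataset.delay('42', 'https://b')  -> raises celery_once.AlreadyQueued
```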

apps/dataset/task/tools.py (31 additions, 1 deletion)

@@ -11,9 +11,11 @@
 import re
 import traceback
 
+from django.db.models import QuerySet
+
 from common.util.fork import ChildLink, Fork
 from common.util.split_model import get_split_model
-from dataset.models import Type, Document, Status
+from dataset.models import Type, Document, DataSet, Status
 
 max_kb_error = logging.getLogger("max_kb_error")
 max_kb = logging.getLogger("max_kb")
@@ -38,6 +40,34 @@ def handler(child_link: ChildLink, response: Fork.Response):
     return handler
 
 
+def get_sync_handler(dataset_id):
+    from dataset.serializers.document_serializers import DocumentSerializers
+    dataset = QuerySet(DataSet).filter(id=dataset_id).first()
+
+    def handler(child_link: ChildLink, response: Fork.Response):
+        if response.status == 200:
+            try:
+
+                document_name = child_link.tag.text if child_link.tag is not None and len(
+                    child_link.tag.text.strip()) > 0 else child_link.url
+                paragraphs = get_split_model('web.md').parse(response.content)
+                first = QuerySet(Document).filter(meta__source_url=child_link.url.strip(),
+                                                  dataset=dataset).first()
+                if first is not None:
+                    # If the document already exists, use document sync
+                    DocumentSerializers.Sync(data={'document_id': first.id}).sync()
+                else:
+                    # Otherwise insert a new document
+                    DocumentSerializers.Create(data={'dataset_id': dataset.id}).save(
+                        {'name': document_name, 'paragraphs': paragraphs,
+                         'meta': {'source_url': child_link.url.strip(), 'selector': dataset.meta.get('selector')},
+                         'type': Type.web}, with_valid=True)
+            except Exception as e:
+                logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
+
+    return handler
+
+
 def get_sync_web_document_handler(dataset_id):
     from dataset.serializers.document_serializers import DocumentSerializers
