|
| 1 | +import logging |
1 | 2 | import os |
| 3 | +import re |
| 4 | +import traceback |
2 | 5 | from functools import reduce |
3 | 6 | from typing import Dict |
4 | 7 |
|
5 | 8 | import uuid_utils.compat as uuid |
| 9 | +from celery_once import AlreadyQueued |
| 10 | +from django.core import validators |
6 | 11 | from django.db import transaction, models |
7 | 12 | from django.db.models import QuerySet |
| 13 | +from django.db.models.functions import Reverse, Substr |
8 | 14 | from django.utils.translation import gettext_lazy as _ |
9 | 15 | from rest_framework import serializers |
10 | 16 |
|
11 | 17 | from common.db.search import native_search, get_dynamics_model, native_page_search |
12 | 18 | from common.db.sql_execute import select_list |
| 19 | +from common.event import ListenerManagement |
13 | 20 | from common.exception.app_exception import AppApiException |
14 | 21 | from common.utils.common import valid_license, post, get_file_content |
| 22 | +from common.utils.fork import Fork, ChildLink |
| 23 | +from common.utils.split_model import get_split_model |
15 | 24 | from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document, Paragraph, Problem, \ |
16 | | - ProblemParagraphMapping, ApplicationKnowledgeMapping |
17 | | -from knowledge.serializers.common import ProblemParagraphManage, get_embedding_model_id_by_knowledge_id, MetaSerializer |
| 25 | + ProblemParagraphMapping, ApplicationKnowledgeMapping, TaskType, State |
| 26 | +from knowledge.serializers.common import ProblemParagraphManage, get_embedding_model_id_by_knowledge_id, MetaSerializer, \ |
| 27 | + GenerateRelatedSerializer |
18 | 28 | from knowledge.serializers.document import DocumentSerializers |
19 | 29 | from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge |
20 | | -from knowledge.task.sync import sync_web_knowledge |
| 30 | +from knowledge.task.generate import generate_related_by_knowledge_id |
| 31 | +from knowledge.task.sync import sync_web_knowledge, sync_replace_web_knowledge |
21 | 32 | from maxkb.conf import PROJECT_DIR |
22 | 33 |
|
23 | 34 |
|
@@ -137,6 +148,35 @@ class Operate(serializers.Serializer): |
137 | 148 | workspace_id = serializers.CharField(required=True, label=_('workspace id')) |
138 | 149 | knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id')) |
139 | 150 |
|
def generate_related(self, instance: Dict, with_valid=True):
    """
    Mark the knowledge base's documents and paragraphs as pending for
    problem generation, then queue the asynchronous generation task.

    :param instance: request payload; ``model_id``, ``prompt`` and
                     ``state_list`` are read from it
    :param with_valid: when true, validate this serializer and ``instance``
                       (via ``GenerateRelatedSerializer``) before doing any work
    :raises AppApiException: code 500 when the task is already queued
    """
    if with_valid:
        self.is_valid(raise_exception=True)
        GenerateRelatedSerializer(data=instance).is_valid(raise_exception=True)
    # The fields visible on this serializer are ``workspace_id`` and
    # ``knowledge_id``; reading ``'id'`` would yield None and make every
    # filter below (and the queued task) operate on no knowledge base.
    knowledge_id = self.data.get('knowledge_id')
    model_id = instance.get("model_id")
    prompt = instance.get("prompt")
    state_list = instance.get('state_list')

    # Every document of the knowledge base goes to PENDING for this task type.
    ListenerManagement.update_status(
        QuerySet(Document).filter(knowledge_id=knowledge_id),
        TaskType.GENERATE_PROBLEM,
        State.PENDING
    )
    # Only paragraphs whose current GENERATE_PROBLEM state is in ``state_list``
    # are reset. The state is extracted by reversing ``status`` and taking the
    # single character at position ``TaskType.GENERATE_PROBLEM.value``.
    ListenerManagement.update_status(
        QuerySet(Paragraph).annotate(
            reversed_status=Reverse('status'),
            task_type_status=Substr('reversed_status', TaskType.GENERATE_PROBLEM.value, 1),
        ).filter(
            task_type_status__in=state_list, knowledge_id=knowledge_id
        ).values('id'),
        TaskType.GENERATE_PROBLEM,
        State.PENDING
    )
    # The helper returns a callable, which is invoked immediately.
    ListenerManagement.get_aggregation_document_status_by_knowledge_id(knowledge_id)()
    try:
        generate_related_by_knowledge_id.delay(knowledge_id, model_id, prompt, state_list)
    except AlreadyQueued as e:
        # Keep the original cause attached for easier debugging.
        raise AppApiException(
            500, _('Failed to send the vectorization task, please try again later!')
        ) from e
| 179 | + |
140 | 180 | def list_application(self, with_valid=True): |
141 | 181 | if with_valid: |
142 | 182 | self.is_valid(raise_exception=True) |
@@ -340,3 +380,80 @@ def save_web(self, instance: Dict, with_valid=True): |
340 | 380 | knowledge.save() |
341 | 381 | sync_web_knowledge.delay(str(knowledge_id), instance.get('source_url'), instance.get('selector')) |
342 | 382 | return {**KnowledgeModelSerializer(knowledge).data, 'document_list': []} |
| 383 | + |
class SyncWeb(serializers.Serializer):
    """
    Re-synchronise a web-site knowledge base from its stored source URL.

    ``sync_type`` selects the strategy: ``replace`` or ``complete``.
    """
    id = serializers.CharField(required=True, label=_('knowledge id'))
    user_id = serializers.UUIDField(required=False, label=_('user id'))
    # The alternation is grouped and anchored on both sides. The previous
    # pattern "^replace|complete$" parsed as "(^replace)|(complete$)" and so
    # accepted values such as "replaceXYZ" or "XYZcomplete".
    sync_type = serializers.CharField(required=True, label=_('sync type'), validators=[
        validators.RegexValidator(regex=re.compile(r"^(?:replace|complete)\Z"),
                                  message=_('The synchronization type only supports:replace|complete'),
                                  code=500)])

    def is_valid(self, *, raise_exception=False):
        """
        Validate the fields, then check that the knowledge base exists and is
        of the web type.

        Field validation always raises on failure, regardless of
        ``raise_exception`` (unchanged from the previous behaviour).

        :return: True when every check passes
        :raises AppApiException: code 300 when the id is unknown, code 500 when
                                 the knowledge base is not a web knowledge base
        """
        super().is_valid(raise_exception=True)
        first = QuerySet(Knowledge).filter(id=self.data.get("id")).first()
        if first is None:
            raise AppApiException(300, _('id does not exist'))
        if first.type != KnowledgeType.WEB:
            raise AppApiException(500, _('Synchronization is only supported for web site types'))
        # Honour the serializer convention of returning a boolean.
        return True

    def sync(self, with_valid=True):
        """
        Run the synchronisation strategy selected by ``sync_type``.

        :param with_valid: when true, run ``is_valid`` first
        :return: True once the strategy has been dispatched
        :raises AppApiException: code 500 for an unsupported ``sync_type``
        """
        if with_valid:
            self.is_valid(raise_exception=True)
        sync_type = self.data.get('sync_type')
        knowledge_id = self.data.get('id')
        knowledge = QuerySet(Knowledge).get(id=knowledge_id)
        # Explicit dispatch table: request data never drives an arbitrary
        # attribute lookup, even when validation was skipped.
        handlers = {
            'replace': self.replace_sync,
            'complete': self.complete_sync,
        }
        handler = handlers.get(sync_type)
        if handler is None:
            raise AppApiException(500, _('The synchronization type only supports:replace|complete'))
        handler(knowledge)
        return True

    @staticmethod
    def get_sync_handler(knowledge):
        """
        Build the per-page callback used while crawling the web site.

        For each successfully fetched page the callback either re-syncs the
        existing document with the same ``source_url`` or creates a new one.

        :param knowledge: the knowledge base the documents belong to
        :return: ``handler(child_link, response)``
        """

        def handler(child_link: ChildLink, response: Fork.Response):
            if response.status != 200:
                return
            try:
                source_url = child_link.url.strip()
                # Fall back to the URL when the link has no tag or no usable
                # text; ``tag.text`` may be None.
                tag_text = child_link.tag.text if child_link.tag is not None else None
                document_name = tag_text if tag_text and tag_text.strip() else child_link.url
                existing = QuerySet(Document).filter(
                    meta__source_url=source_url,
                    knowledge=knowledge
                ).first()
                if existing is not None:
                    # Already known: reuse the document-level sync.
                    DocumentSerializers.Sync(data={'document_id': existing.id}).sync()
                else:
                    # New page: split the content and insert a document.
                    paragraphs = get_split_model('web.md').parse(response.content)
                    # ``KnowledgeType.WEB`` matches the check in ``is_valid``.
                    # NOTE(review): ``Knowledge.WEB`` was used here before and
                    # does not appear to exist on the model - confirm.
                    DocumentSerializers.Create(data={'knowledge_id': knowledge.id}).save(
                        {'name': document_name, 'paragraphs': paragraphs,
                         'meta': {'source_url': source_url,
                                  'selector': knowledge.meta.get('selector')},
                         'type': KnowledgeType.WEB}, with_valid=True)
            except Exception as e:
                # Best effort per page: log and continue so that one bad page
                # does not raise out of the callback.
                logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')

        return handler

    def replace_sync(self, knowledge):
        """
        Replace sync: queue the asynchronous web re-sync task for the
        knowledge base's stored ``source_url`` and ``selector``.

        :param knowledge: the knowledge base to synchronise
        :return: None
        """
        url = knowledge.meta.get('source_url')
        selector = knowledge.meta.get('selector')
        sync_replace_web_knowledge.delay(str(knowledge.id), url, selector)

    def complete_sync(self, knowledge):
        """
        Complete sync: delete every document under the knowledge base, then
        run a replace sync.

        :param knowledge: the knowledge base to synchronise
        :return: None
        """
        # Delete problem/paragraph associations.
        QuerySet(ProblemParagraphMapping).filter(knowledge=knowledge).delete()
        # Delete documents.
        QuerySet(Document).filter(knowledge=knowledge).delete()
        # Delete paragraphs.
        QuerySet(Paragraph).filter(knowledge=knowledge).delete()
        # Delete embeddings.
        delete_embedding_by_knowledge(self.data.get('id'))
        # Synchronise.
        self.replace_sync(knowledge)
0 commit comments