
Commit 8a003af

✨[Request] #694 implement document-level vectorization and K-means clustering to summary knowledge
2 parents 86f4185 + 65fc9a5 commit 8a003af

21 files changed: +3292 −255 lines
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+system_prompt: |-
+  You are a professional knowledge summarization assistant. Your task is to generate a concise summary of a document cluster based on multiple documents.
+
+  **Summary Requirements:**
+  1. The input contains multiple documents (each document has title and content snippets)
+  2. You need to extract the common themes and key topics from these documents
+  3. Generate a summary that represents the collective content of the cluster
+  4. The summary should be accurate, coherent, and written in natural language
+  5. Keep the summary within the specified word limit
+
+  **Guidelines:**
+  - Focus on identifying shared themes and topics across documents
+  - Highlight key concepts, domains, or subject matter
+  - Use clear and concise language
+  - Avoid listing individual document titles unless necessary
+  - The summary should help users understand what this group of documents covers
+
+user_prompt: |
+  Please generate a concise summary of the following document cluster:
+
+  {{ cluster_content }}
+
+  Summary ({{ max_words }} words):
+
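The template above exposes two placeholders, {{ cluster_content }} and {{ max_words }}. As a minimal sketch of how such a YAML template could be loaded and rendered, the snippet below mirrors the Jinja2 usage removed from elasticsearch_service.py later in this diff; the file path and helper name are assumptions for illustration, not part of this commit:

```python
# Illustrative only: load a prompt-template YAML like the one above and fill
# in its placeholders. The path and helper name are hypothetical.
from typing import Tuple

import yaml
from jinja2 import StrictUndefined, Template


def render_cluster_summary_prompt(template_path: str, cluster_content: str, max_words: int = 150) -> Tuple[str, str]:
    """Load a prompt template YAML and render its system/user prompts."""
    with open(template_path, "r", encoding="utf-8") as f:
        prompts = yaml.safe_load(f)

    # StrictUndefined makes a missing placeholder fail loudly instead of rendering empty text
    system_prompt = Template(prompts["system_prompt"], undefined=StrictUndefined).render({})
    user_prompt = Template(prompts["user_prompt"], undefined=StrictUndefined).render(
        {"cluster_content": cluster_content, "max_words": max_words}
    )
    return system_prompt, user_prompt
```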
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+system_prompt: |-
+  You are a professional cluster summarization assistant. Your task is to merge multiple document summaries into a cohesive cluster summary.
+
+  **Summary Requirements:**
+  1. The input contains summaries of multiple documents that belong to the same cluster
+  2. These documents share similar themes or topics (grouped by clustering)
+  3. You need to synthesize a unified summary that captures the collective content
+  4. The summary should highlight common themes and key information across documents
+  5. Keep the summary within the specified word limit
+
+  **Guidelines:**
+  - Identify shared themes and topics across documents
+  - Highlight common concepts and subject matter
+  - Use clear and concise language
+  - Avoid listing individual document titles unless necessary
+  - Focus on what this group of documents collectively covers
+  - The summary should be coherent and represent the cluster's unified content
+  - **Important: Do not use any separators (like ---, ***, etc.), generate plain text summary only**
+
+user_prompt: |
+  Please generate a unified summary of the following document cluster based on individual document summaries:
+
+  {{ document_summaries }}
+
+  **Important Reminders:**
+  - Do not use any separators (like ---, ***, ===, etc.)
+  - Do not include document titles or filenames
+  - Generate plain text summary content only
+
+  Cluster Summary ({{ max_words }} words):
+
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+system_prompt: |-
+  你是一个专业的簇总结助手。你的任务是将多个文档总结合并为一个连贯的簇总结。
+
+  **总结要求:**
+  1. 输入包含属于同一簇的多个文档的总结
+  2. 这些文档共享相似的主题或话题(通过聚类分组)
+  3. 你需要综合成一个统一的总结,捕捉集合内容
+  4. 总结应突出文档间的共同主题和关键信息
+  5. 保持在指定的字数限制内
+
+  **指导原则:**
+  - 识别文档间的共同主题和话题
+  - 突出共同概念和主题内容
+  - 使用清晰简洁的语言
+  - 除非必要,避免列出单个文档标题
+  - 专注于这组文档共同涵盖的内容
+  - 总结应连贯且代表簇的统一内容
+  - 确保准确、全面,明确关键实体,不要遗漏重要信息
+  - **重要:不要使用任何分隔符(如---、***等),直接生成纯文本总结**
+
+user_prompt: |
+  请根据以下文档总结生成统一的簇总结:
+
+  {{ document_summaries }}
+
+  **重要提醒:**
+  - 不要使用任何分隔符(如---、***、===等)
+  - 不要包含文档标题或文件名
+  - 直接生成纯文本总结内容
+
+  簇总结({{ max_words }}字):
+
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+system_prompt: |-
+  You are a professional document summarization assistant. Your task is to generate a concise summary of a document based on its key content snippets.
+
+  **Summary Requirements:**
+  1. The input contains key snippets from a document (typically from beginning, middle, and end sections)
+  2. You need to extract the main themes, topics, and key information
+  3. Generate a summary that represents the document's core content
+  4. The summary should be accurate, coherent, and concise
+  5. Keep the summary within the specified word limit
+
+  **Guidelines:**
+  - Focus on identifying main themes and key topics
+  - Highlight important concepts and information
+  - Use clear and concise language
+  - Avoid redundancy and unnecessary details
+  - The summary should help users understand what the document covers
+  - **Important: Do not use any separators (like ---, ***, etc.), generate plain text summary only**
+
+user_prompt: |
+  Please generate a concise summary of the following document:
+
+  Document name: {{ filename }}
+
+  Content snippets:
+  {{ content }}
+
+  Summary ({{ max_words }} words):
+
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+system_prompt: |-
+  你是一个专业的文档总结助手。你的任务是根据文档的关键内容片段生成简洁的总结。
+
+  **总结要求:**
+  1. 输入包含文档的关键片段(通常来自开头、中间和结尾部分)
+  2. 你需要提取主要主题、话题和关键信息
+  3. 生成能代表文档核心内容的总结
+  4. 总结应准确、连贯且简洁
+  5. 保持在指定的字数限制内
+
+  **指导原则:**
+  - 专注于识别主要主题和关键话题
+  - 突出重要概念和信息
+  - 使用清晰简洁的语言
+  - 避免冗余和不必要的细节
+  - 总结应帮助用户理解文档涵盖的内容
+  - 确保总结准确、全面,不要遗漏关键实体和信息
+  - **重要:不要使用任何分隔符(如---、***等),直接生成纯文本总结**
+
+user_prompt: |
+  请为以下文档生成简洁的总结:
+
+  文档名称:{{ filename }}
+
+  内容片段:
+  {{ content }}
+
+  总结({{ max_words }}字):
+
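Both document-level templates expect a {{ content }} payload built from key snippets taken from the beginning, middle, and end of a document. The sampling code itself lives in utils/document_vector_utils.py and is not part of this diff, so the helper below is only a hedged sketch of one plausible way to assemble that payload:

```python
# Illustrative sketch: assemble the "content snippets" payload expected by the
# document-level prompt templates. The helper name and limits are assumptions.
from typing import List


def build_content_snippets(chunks: List[str], max_chars_per_snippet: int = 500) -> str:
    """Pick representative chunks (head, middle, tail) and join them for the prompt."""
    if not chunks:
        return ""
    # A set de-duplicates the indices for very short documents
    picks = {0, len(chunks) // 2, len(chunks) - 1}
    snippets = [chunks[i][:max_chars_per_snippet] for i in sorted(picks)]
    return "\n\n".join(snippets)
```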

backend/pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -14,7 +14,9 @@ dependencies = [
     "pyyaml>=6.0.2",
     "redis>=5.0.0",
     "fastmcp==2.12.0",
-    "langchain>=0.3.26"
+    "langchain>=0.3.26",
+    "scikit-learn>=1.0.0",
+    "numpy>=1.24.0"
 ]
 
 [project.optional-dependencies]
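The two new dependencies back the clustering step named in the commit title: document-level embedding vectors clustered with K-means. The sketch below shows the general shape of such a step with scikit-learn; the function name, the heuristic choice of k, and the parameters are illustrative assumptions, not the code that ships in utils/document_vector_utils.py:

```python
# Illustrative sketch of K-means over document-level embeddings using the
# newly added scikit-learn and numpy dependencies.
from typing import Dict, List, Optional

import numpy as np
from sklearn.cluster import KMeans


def cluster_document_embeddings(doc_embeddings: np.ndarray, k: Optional[int] = None) -> Dict[int, List[int]]:
    """Group document embeddings into k clusters; pick a heuristic k when none is given."""
    n_docs = doc_embeddings.shape[0]
    if k is None:
        # Heuristic: roughly the square root of the corpus size, at least 1
        k = max(1, min(int(np.sqrt(n_docs)), n_docs))

    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(doc_embeddings)

    clusters: Dict[int, List[int]] = {}
    for doc_idx, label in enumerate(labels):
        clusters.setdefault(int(label), []).append(doc_idx)
    return clusters
```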

backend/services/elasticsearch_service.py

Lines changed: 61 additions & 123 deletions
@@ -18,14 +18,11 @@
 
 from fastapi import Body, Depends, Path, Query
 from fastapi.responses import StreamingResponse
-from jinja2 import Template, StrictUndefined
 from nexent.core.models.embedding_model import OpenAICompatibleEmbedding, JinaEmbedding, BaseEmbedding
 from nexent.core.nlp.tokenizer import calculate_term_weights
 from nexent.vector_database.elasticsearch_core import ElasticSearchCore
-from openai import OpenAI
-from openai.types.chat import ChatCompletionMessageParam
 
-from consts.const import ES_API_KEY, ES_HOST, LANGUAGE, MODEL_CONFIG_MAPPING, MESSAGE_ROLE, KNOWLEDGE_SUMMARY_MAX_TOKENS_ZH, KNOWLEDGE_SUMMARY_MAX_TOKENS_EN
+from consts.const import ES_API_KEY, ES_HOST, LANGUAGE
 from database.attachment_db import delete_file
 from database.knowledge_db import (
     create_knowledge_record,
@@ -36,97 +33,15 @@
 from services.redis_service import get_redis_service
 from utils.config_utils import tenant_config_manager, get_model_name_from_config
 from utils.file_management_utils import get_all_files_status, get_file_size
-from utils.prompt_template_utils import get_knowledge_summary_prompt_template
 
 # Configure logging
 logger = logging.getLogger("elasticsearch_service")
 
 
 
 
-def generate_knowledge_summary_stream(keywords: str, language: str, tenant_id: str, model_id: Optional[int] = None) -> Generator:
-    """
-    Generate a knowledge base summary based on keywords
-
-    Args:
-        keywords: Keywords that frequently appear in the knowledge base content
-        language: Language of the knowledge base content
-        tenant_id: The tenant ID for configuration
-
-    Returns:
-        str: Generate a knowledge base summary
-    """
-    # Load prompt words based on language
-    prompts = get_knowledge_summary_prompt_template(language)
-
-    # Render templates using Jinja2
-    system_prompt = Template(
-        prompts['system_prompt'], undefined=StrictUndefined).render({})
-    user_prompt = Template(prompts['user_prompt'], undefined=StrictUndefined).render(
-        {'content': keywords})
-
-    # Build messages
-    messages: List[ChatCompletionMessageParam] = [
-        {"role": MESSAGE_ROLE["SYSTEM"], "content": system_prompt},
-        {"role": MESSAGE_ROLE["USER"], "content": user_prompt}
-    ]
-
-    # Get model configuration
-    if model_id:
-        try:
-            from database.model_management_db import get_model_by_model_id
-            model_info = get_model_by_model_id(model_id, tenant_id)
-            if model_info:
-                model_config = {
-                    'api_key': model_info.get('api_key', ''),
-                    'base_url': model_info.get('base_url', ''),
-                    'model_name': model_info.get('model_name', ''),
-                    'model_repo': model_info.get('model_repo', '')
-                }
-            else:
-                # Fallback to default model if specified model not found
-                logger.warning(f"Specified model {model_id} not found, falling back to default LLM.")
-                model_config = tenant_config_manager.get_model_config(
-                    key=MODEL_CONFIG_MAPPING["llm"], tenant_id=tenant_id)
-        except Exception as e:
-            logger.warning(f"Failed to get model {model_id}, using default model: {e}")
-            model_config = tenant_config_manager.get_model_config(
-                key=MODEL_CONFIG_MAPPING["llm"], tenant_id=tenant_id)
-    else:
-        # Use default model configuration
-        model_config = tenant_config_manager.get_model_config(
-            key=MODEL_CONFIG_MAPPING["llm"], tenant_id=tenant_id)
-
-    # initialize OpenAI client
-    client = OpenAI(api_key=model_config.get('api_key', ""),
-                    base_url=model_config.get('base_url', ""))
-
-    try:
-        # Create stream chat completion request
-        max_tokens = KNOWLEDGE_SUMMARY_MAX_TOKENS_ZH if language == LANGUAGE[
-            "ZH"] else KNOWLEDGE_SUMMARY_MAX_TOKENS_EN
-        # Get model name for the request
-        model_name_for_request = model_config.get("model_name", "")
-        if model_config.get("model_repo"):
-            model_name_for_request = f"{model_config['model_repo']}/{model_name_for_request}"
-
-        stream = client.chat.completions.create(
-            model=model_name_for_request,
-            messages=messages,
-            max_tokens=max_tokens,  # add max_tokens limit
-            stream=True  # enable stream output
-        )
-
-        # Iterate through stream response
-        for chunk in stream:
-            new_token = chunk.choices[0].delta.content
-            if new_token is not None:
-                yield new_token
-        yield "END"
-
-    except Exception as e:
-        logger.error(f"Error occurred: {str(e)}")
-        yield f"Error: {str(e)}"
+# Old keyword-based summary method removed - replaced with Map-Reduce approach
+# See utils/document_vector_utils.py for new implementation
 
 
 # Initialize ElasticSearchCore instance with HTTPS support
@@ -871,62 +786,85 @@ async def summary_index_name(self,
                                  model_id: Optional[int] = None
                                  ):
         """
-        Generate a summary for the specified index based on its content
+        Generate a summary for the specified index using advanced Map-Reduce approach
+
+        New implementation:
+        1. Get documents and cluster them by semantic similarity
+        2. Map: Summarize each document individually
+        3. Reduce: Merge document summaries into cluster summaries
+        4. Return: Combined knowledge base summary
 
         Args:
             index_name: Name of the index to summarize
-            batch_size: Number of documents to process per batch
+            batch_size: Number of documents to sample (default: 1000)
            es_core: ElasticSearchCore instance
            tenant_id: ID of the tenant
            language: Language of the summary (default: 'zh')
+            model_id: Model ID for LLM summarization
 
         Returns:
             StreamingResponse containing the generated summary
         """
         try:
-            # Get all documents
+            from utils.document_vector_utils import (
+                process_documents_for_clustering,
+                kmeans_cluster_documents,
+                summarize_clusters_map_reduce,
+                merge_cluster_summaries
+            )
+
             if not tenant_id:
-                raise Exception(
-                    "Tenant ID is required for summary generation.")
-            all_documents = ElasticSearchService.get_random_documents(
-                index_name, batch_size, es_core)
-            all_chunks = self._clean_chunks_for_summary(all_documents)
-            keywords_dict = calculate_term_weights(all_chunks)
-            keywords_for_summary = ""
-            for _, key in enumerate(keywords_dict):
-                keywords_for_summary = keywords_for_summary + ", " + key
-
+                raise Exception("Tenant ID is required for summary generation.")
+
+            # Use new Map-Reduce approach
+            sample_count = min(batch_size // 5, 200)  # Sample reasonable number of documents
+
+            # Step 1: Get documents and calculate embeddings
+            document_samples, doc_embeddings = process_documents_for_clustering(
+                index_name=index_name,
+                es_core=es_core,
+                sample_doc_count=sample_count
+            )
+
+            if not document_samples:
+                raise Exception("No documents found in index.")
+
+            # Step 2: Cluster documents
+            clusters = kmeans_cluster_documents(doc_embeddings, k=None)
+
+            # Step 3: Map-Reduce summarization
+            cluster_summaries = summarize_clusters_map_reduce(
+                document_samples=document_samples,
+                clusters=clusters,
+                language=language,
+                doc_max_words=100,
+                cluster_max_words=150,
+                model_id=model_id,
+                tenant_id=tenant_id
+            )
+
+            # Step 4: Merge into final summary
+            final_summary = merge_cluster_summaries(cluster_summaries)
+
+            # Stream the result
             async def generate_summary():
-                token_join = []
                 try:
-                    for new_token in generate_knowledge_summary_stream(keywords_for_summary, language, tenant_id, model_id):
-                        if new_token == "END":
-                            break
-                        else:
-                            token_join.append(new_token)
-                            yield f"data: {{\"status\": \"success\", \"message\": \"{new_token}\"}}\n\n"
-                            await asyncio.sleep(0.1)
+                    # Stream the summary character by character
+                    for char in final_summary:
+                        yield f"data: {{\"status\": \"success\", \"message\": \"{char}\"}}\n\n"
+                        await asyncio.sleep(0.01)
+                    yield f"data: {{\"status\": \"completed\"}}\n\n"
                 except Exception as e:
                     yield f"data: {{\"status\": \"error\", \"message\": \"{e}\"}}\n\n"
-
-            # Return the flow response
+
             return StreamingResponse(
                 generate_summary(),
                 media_type="text/event-stream"
             )
-
+
         except Exception as e:
-            raise Exception(f"{str(e)}")
-
-    @staticmethod
-    def _clean_chunks_for_summary(all_documents):
-        # Only use these three fields for summarization
-        all_chunks = ""
-        for _, chunk in enumerate(all_documents['documents']):
-            all_chunks = all_chunks + "\n" + \
-                chunk["title"] + "\n" + chunk["filename"] + \
-                "\n" + chunk["content"]
-        return all_chunks
+            logger.error(f"Knowledge base summary generation failed: {str(e)}", exc_info=True)
+            raise Exception(f"Failed to generate summary: {str(e)}")
 
     @staticmethod
     def get_random_documents(
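For orientation, the sketch below shows how the map and reduce steps invoked above might fit together. The real helpers live in utils/document_vector_utils.py, which is not part of this diff, so the signatures, the summarize_text callable, and the joining strategy are illustrative assumptions rather than the committed code:

```python
# Illustrative sketch of the Map-Reduce summarization flow used by the
# refactored summary endpoint. All names and signatures are assumptions.
from typing import Callable, Dict, List


def summarize_clusters_map_reduce_sketch(
    document_samples: List[dict],                 # e.g. {"filename": ..., "content": ...}
    clusters: Dict[int, List[int]],               # cluster id -> document indices
    summarize_text: Callable[[str, int], str],    # wraps the LLM call: (text, max_words) -> summary
    doc_max_words: int = 100,
    cluster_max_words: int = 150,
) -> Dict[int, str]:
    """Map: summarize each document; Reduce: merge per-document summaries per cluster."""
    cluster_summaries: Dict[int, str] = {}
    for cluster_id, doc_indices in clusters.items():
        # Map step: one short summary per document in the cluster
        doc_summaries = [
            summarize_text(document_samples[i]["content"], doc_max_words) for i in doc_indices
        ]
        # Reduce step: merge the document summaries into one cluster summary
        merged_input = "\n".join(doc_summaries)
        cluster_summaries[cluster_id] = summarize_text(merged_input, cluster_max_words)
    return cluster_summaries


def merge_cluster_summaries_sketch(cluster_summaries: Dict[int, str]) -> str:
    """Final step: join cluster summaries into the knowledge-base level summary."""
    return "\n".join(cluster_summaries[cid] for cid in sorted(cluster_summaries))
```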
