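feat: hoist LightRAG prompt delimiters and default entity types to module constants; remove rag_response prompts (#1443)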

earayu · web-flow · commit 084f5f5bc584 · 2026-03-03T14:05:06.000+08:00
* feat: update

* feat: update
diff --git a/aperag/graph/lightrag/lightrag.py b/aperag/graph/lightrag/lightrag.py
@@ -69,7 +69,7 @@
     extract_entities,
     merge_nodes_and_edges,
 )
-from .prompt import GRAPH_FIELD_SEP, PROMPTS
+from .prompt import DEFAULT_ENTITY_TYPES, GRAPH_FIELD_SEP
 from .types import KnowledgeGraph
 from .utils import (
     EmbeddingFunc,
@@ -228,7 +228,7 @@ class LightRAG:
     language: str = field(default="English")
     """Language for entity extraction and query responses."""
 
-    entity_types: list[str] = field(default_factory=lambda: PROMPTS["DEFAULT_ENTITY_TYPES"])
+    entity_types: list[str] = field(default_factory=lambda: DEFAULT_ENTITY_TYPES)
     """List of entity types to extract during graph indexing."""
 
     example_number: int | None = field(default=None)
diff --git a/aperag/graph/lightrag/operate.py b/aperag/graph/lightrag/operate.py
@@ -50,7 +50,13 @@
     QueryParam,
     TextChunkSchema,
 )
-from .prompt import GRAPH_FIELD_SEP, PROMPTS
+from .prompt import (
+    DEFAULT_COMPLETION_DELIMITER,
+    DEFAULT_RECORD_DELIMITER,
+    DEFAULT_TUPLE_DELIMITER,
+    GRAPH_FIELD_SEP,
+    PROMPTS,
+)
 from .types import GraphNodeData, GraphNodeDataDict, MergeSuggestion
 from .utils import (
     LightRAGLogger,
@@ -654,9 +660,9 @@ async def extract_entities(
         examples = "\n".join(PROMPTS["entity_extraction_examples"])
 
     example_context_base = dict(
-        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
-        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
-        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
+        tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
+        record_delimiter=DEFAULT_RECORD_DELIMITER,
+        completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
         entity_types=", ".join(entity_types),
         language=language,
     )
@@ -665,9 +671,9 @@ async def extract_entities(
 
     entity_extract_prompt = PROMPTS["entity_extraction"]
     context_base = dict(
-        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
-        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
-        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
+        tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
+        record_delimiter=DEFAULT_RECORD_DELIMITER,
+        completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
         entity_types=",".join(entity_types),
         examples=examples,
         language=language,
@@ -2072,13 +2078,11 @@ async def _batch_analyze_entities_with_llm(
             entities_text += f"- Degree: {entity.degree or 0}\n\n"
 
         # Use prompt from prompts.py
-        from .prompt import PROMPTS
-
         prompt = PROMPTS["batch_merge_analysis"].format(
             entities_list=entities_text,
-            tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
-            record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
-            completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
+            tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
+            record_delimiter=DEFAULT_RECORD_DELIMITER,
+            completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
             graph_field_sep=GRAPH_FIELD_SEP,
         )
 
@@ -2132,9 +2136,7 @@ def parse_llm_merge_response(
         entity_lookup = {(entity.entity_name or entity.entity_id): entity for entity in entities_list}
 
         # Split by record delimiter
-        from .prompt import PROMPTS
-
-        records = llm_response.split(PROMPTS["DEFAULT_RECORD_DELIMITER"])
+        records = llm_response.split(DEFAULT_RECORD_DELIMITER)
 
         if lightrag_logger:
             lightrag_logger.debug(f"Parsing LLM response: found {len(records)} potential records")
@@ -2144,7 +2146,7 @@ def parse_llm_merge_response(
 
         for i, record in enumerate(records):
             record = record.strip()
-            if not record or PROMPTS["DEFAULT_COMPLETION_DELIMITER"] in record:
+            if not record or DEFAULT_COMPLETION_DELIMITER in record:
                 continue
 
             suggestion = parse_single_merge_record(record, entity_lookup, confidence_threshold, lightrag_logger)
@@ -2185,8 +2187,6 @@ def parse_single_merge_record(
         MergeSuggestion if successfully parsed and meets threshold, None otherwise
     """
     try:
-        # Import required constants and types
-        from .prompt import GRAPH_FIELD_SEP, PROMPTS
         from .types import GraphNodeData
 
         # Extract content between quotes and parentheses
@@ -2195,7 +2195,7 @@ def parse_single_merge_record(
             content = content[:-1]
 
         # Parse the content using tuple delimiter
-        parts = content.split(PROMPTS["DEFAULT_TUPLE_DELIMITER"])
+        parts = content.split(DEFAULT_TUPLE_DELIMITER)
 
         # Filter out empty parts (especially the first one if content starts with delimiter)
         parts = [part.strip() for part in parts if part.strip()]
diff --git a/aperag/graph/lightrag/prompt.py b/aperag/graph/lightrag/prompt.py
@@ -36,14 +36,10 @@
 from typing import Any
 
 GRAPH_FIELD_SEP = "<SEP>"
-
-PROMPTS: dict[str, Any] = {}
-
-PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
-PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
-PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
-
-PROMPTS["DEFAULT_ENTITY_TYPES"] = [
+DEFAULT_TUPLE_DELIMITER = "<|>"
+DEFAULT_RECORD_DELIMITER = "##"
+DEFAULT_COMPLETION_DELIMITER = "<|COMPLETE|>"
+DEFAULT_ENTITY_TYPES = [
     "organization",
     "person",
     "geo",
@@ -54,13 +50,16 @@
     "category",
 ]
 
+PROMPTS: dict[str, Any] = {}
+
+# Keys: language, entity_types, tuple_delimiter, record_delimiter, completion_delimiter, examples, input_text
 PROMPTS["entity_extraction"] = """---Goal---
 Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
 Use {language} as output language.
 
 ---Steps---
 1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Full Name of the entity, must use **same language** as input text, it's important. If English, capitalized the name.
+- entity_name: Full Name of the entity, must use **same language** as Real Data Text, it's important. If English, capitalized the name.
 - entity_type: One of the following types: [{entity_types}]
 - entity_description: Comprehensive description of the entity's attributes and activities
 Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
@@ -95,6 +94,7 @@
 ######################
 Output:"""
 
+# Keys: tuple_delimiter, record_delimiter, completion_delimiter  (rendered into entity_extraction via {examples})
 PROMPTS["entity_extraction_examples"] = [
     """Example 1:
 
@@ -211,6 +211,7 @@
 #############################""",
 ]
 
+# Keys: language, entity_name, description_list
 PROMPTS[
     "summarize_entity_descriptions"
 ] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
@@ -228,13 +229,14 @@
 Output:
 """
 
+# Keys: language, entity_types, tuple_delimiter, record_delimiter, completion_delimiter
 PROMPTS["entity_continue_extraction"] = """
 MANY entities and relationships were missed in the last extraction.
 
 ---Remember Steps---
 
 1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, use same language as input text. If English, capitalized the name.
+- entity_name: Name of the entity, use same language as Real Data Text. If English, capitalized the name.
 - entity_type: One of the following types: [{entity_types}]
 - entity_description: Comprehensive description of the entity's attributes and activities
 Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
@@ -260,6 +262,7 @@
 Add them below using the same format:\n
 """.strip()
 
+# Keys: (none)
 PROMPTS["entity_if_loop_extraction"] = """
 ---Goal---'
 
@@ -270,42 +273,10 @@
 Answer ONLY by `YES` OR `NO` if there are still entities that need to be added.
 """.strip()
 
+# Keys: (none)
 PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question.[no-context]"
 
-PROMPTS["rag_response"] = """---Role---
-
-You are a helpful assistant responding to user query about Knowledge Graph and Document Chunks provided in JSON format below.
-
-
----Goal---
-
-Generate a concise response based on Knowledge Base and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Knowledge Base, and incorporating general knowledge relevant to the Knowledge Base. Do not include information not provided by Knowledge Base.
-
-When handling relationships with timestamps:
-1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
-2. When encountering conflicting relationships, consider both the semantic content and the timestamp
-3. Don't automatically prefer the most recently created relationships - use judgment based on the context
-4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
-
----Conversation History---
-{history}
-
----Knowledge Graph and Document Chunks---
-{context_data}
-
----Response Rules---
-
-- Target format and length: {response_type}
-- Use markdown formatting with appropriate section headings
-- Please respond in the same language as the user's question.
-- Ensure the response maintains continuity with the conversation history.
-- List up to 5 most important reference sources at the end under "References" section. Clearly indicating whether each source is from Knowledge Graph (KG) or Document Chunks (DC), and include the file path if available, in the following format: [KG/DC] file_path
-- If you don't know the answer, just say so.
-- Do not make anything up. Do not include information not provided by the Knowledge Base.
-- Addtional user prompt: {user_prompt}
-
-Response:"""
-
+# Keys: examples, history, query
 PROMPTS["keywords_extraction"] = """---Role---
 
 You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query and conversation history.
@@ -335,11 +306,12 @@
 
 Current Query: {query}
 ######################
-The `Output` should be human text, not unicode characters. Keep the same language as `Query`.
+The `Output` should be human text, not unicode characters. Keep the same language as `Current Query`.
 Output:
 
 """
 
+# Keys: (none, static examples rendered into keywords_extraction via {examples})
 PROMPTS["keywords_extraction_examples"] = [
     """Example 1:
 
@@ -373,39 +345,7 @@
 #############################""",
 ]
 
-PROMPTS["naive_rag_response"] = """---Role---
-
-You are a helpful assistant responding to user query about Document Chunks provided provided in JSON format below.
-
----Goal---
-
-Generate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks.
-
-When handling content with timestamps:
-1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
-2. When encountering conflicting information, consider both the content and the timestamp
-3. Don't automatically prefer the most recent content - use judgment based on the context
-4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
-
----Conversation History---
-{history}
-
----Document Chunks(DC)---
-{content_data}
-
----Response Rules---
-
-- Target format and length: {response_type}
-- Use markdown formatting with appropriate section headings
-- Please respond in the same language as the user's question.
-- Ensure the response maintains continuity with the conversation history.
-- List up to 5 most important reference sources at the end under "References" section. Clearly indicating each source from Document Chunks(DC), and include the file path if available, in the following format: [DC] file_path
-- If you don't know the answer, just say so.
-- Do not include information not provided by the Document Chunks.
-- Addtional user prompt: {user_prompt}
-
-Response:"""
-
+# Keys: tuple_delimiter, record_delimiter, completion_delimiter, graph_field_sep, entities_list
 PROMPTS["batch_merge_analysis"] = """---Goal---
 Given a list of entities from a knowledge graph, identify groups of entities that should be merged because they refer to the EXACT SAME real-world object/individual/specific instance.
 
diff --git a/aperag/graph/lightrag_manager.py b/aperag/graph/lightrag_manager.py
@@ -22,7 +22,7 @@
 from aperag.db.models import Collection
 from aperag.db.ops import db_ops
 from aperag.graph.lightrag import LightRAG
-from aperag.graph.lightrag.prompt import PROMPTS
+from aperag.graph.lightrag.prompt import DEFAULT_ENTITY_TYPES
 from aperag.graph.lightrag.utils import EmbeddingFunc
 from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync
 from aperag.llm.llm_error_types import (
@@ -47,7 +47,7 @@ class LightRAGConfig:
     SUMMARY_TO_MAX_TOKENS = 2000
     FORCE_LLM_SUMMARY_ON_MERGE = 10
     EMBEDDING_MAX_TOKEN_SIZE = 8192
-    DEFAULT_LANGUAGE = "simplified chinese"
+    DEFAULT_LANGUAGE = "zh-CN"
 
 
 class LightRAGError(Exception):
@@ -82,7 +82,7 @@ async def create_lightrag_instance(collection: Collection) -> LightRAG:
         config = parseCollectionConfig(collection.config)
         kg_config = config.knowledge_graph_config
         language = LightRAGConfig.DEFAULT_LANGUAGE
-        entity_types = PROMPTS["DEFAULT_ENTITY_TYPES"]
+        entity_types = DEFAULT_ENTITY_TYPES
 
         # Use collection-level language if available
         if config.language:
diff --git a/tests/unit_test/graphindex/test_merge_suggestions.py b/tests/unit_test/graphindex/test_merge_suggestions.py