Skip to content

Commit 084f5f5

Browse files
authored
feat: update (#1443)
* feat: update * feat: update
1 parent a02bb88 commit 084f5f5

File tree

5 files changed

+69
-125
lines changed

5 files changed

+69
-125
lines changed

aperag/graph/lightrag/lightrag.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
extract_entities,
7070
merge_nodes_and_edges,
7171
)
72-
from .prompt import GRAPH_FIELD_SEP, PROMPTS
72+
from .prompt import DEFAULT_ENTITY_TYPES, GRAPH_FIELD_SEP
7373
from .types import KnowledgeGraph
7474
from .utils import (
7575
EmbeddingFunc,
@@ -228,7 +228,7 @@ class LightRAG:
228228
language: str = field(default="English")
229229
"""Language for entity extraction and query responses."""
230230

231-
entity_types: list[str] = field(default_factory=lambda: PROMPTS["DEFAULT_ENTITY_TYPES"])
231+
entity_types: list[str] = field(default_factory=lambda: DEFAULT_ENTITY_TYPES)
232232
"""List of entity types to extract during graph indexing."""
233233

234234
example_number: int | None = field(default=None)

aperag/graph/lightrag/operate.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,13 @@
5050
QueryParam,
5151
TextChunkSchema,
5252
)
53-
from .prompt import GRAPH_FIELD_SEP, PROMPTS
53+
from .prompt import (
54+
DEFAULT_COMPLETION_DELIMITER,
55+
DEFAULT_RECORD_DELIMITER,
56+
DEFAULT_TUPLE_DELIMITER,
57+
GRAPH_FIELD_SEP,
58+
PROMPTS,
59+
)
5460
from .types import GraphNodeData, GraphNodeDataDict, MergeSuggestion
5561
from .utils import (
5662
LightRAGLogger,
@@ -654,9 +660,9 @@ async def extract_entities(
654660
examples = "\n".join(PROMPTS["entity_extraction_examples"])
655661

656662
example_context_base = dict(
657-
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
658-
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
659-
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
663+
tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
664+
record_delimiter=DEFAULT_RECORD_DELIMITER,
665+
completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
660666
entity_types=", ".join(entity_types),
661667
language=language,
662668
)
@@ -665,9 +671,9 @@ async def extract_entities(
665671

666672
entity_extract_prompt = PROMPTS["entity_extraction"]
667673
context_base = dict(
668-
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
669-
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
670-
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
674+
tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
675+
record_delimiter=DEFAULT_RECORD_DELIMITER,
676+
completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
671677
entity_types=",".join(entity_types),
672678
examples=examples,
673679
language=language,
@@ -2072,13 +2078,11 @@ async def _batch_analyze_entities_with_llm(
20722078
entities_text += f"- Degree: {entity.degree or 0}\n\n"
20732079

20742080
# Use prompt from prompt.py
2075-
from .prompt import PROMPTS
2076-
20772081
prompt = PROMPTS["batch_merge_analysis"].format(
20782082
entities_list=entities_text,
2079-
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
2080-
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
2081-
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
2083+
tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
2084+
record_delimiter=DEFAULT_RECORD_DELIMITER,
2085+
completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
20822086
graph_field_sep=GRAPH_FIELD_SEP,
20832087
)
20842088

@@ -2132,9 +2136,7 @@ def parse_llm_merge_response(
21322136
entity_lookup = {(entity.entity_name or entity.entity_id): entity for entity in entities_list}
21332137

21342138
# Split by record delimiter
2135-
from .prompt import PROMPTS
2136-
2137-
records = llm_response.split(PROMPTS["DEFAULT_RECORD_DELIMITER"])
2139+
records = llm_response.split(DEFAULT_RECORD_DELIMITER)
21382140

21392141
if lightrag_logger:
21402142
lightrag_logger.debug(f"Parsing LLM response: found {len(records)} potential records")
@@ -2144,7 +2146,7 @@ def parse_llm_merge_response(
21442146

21452147
for i, record in enumerate(records):
21462148
record = record.strip()
2147-
if not record or PROMPTS["DEFAULT_COMPLETION_DELIMITER"] in record:
2149+
if not record or DEFAULT_COMPLETION_DELIMITER in record:
21482150
continue
21492151

21502152
suggestion = parse_single_merge_record(record, entity_lookup, confidence_threshold, lightrag_logger)
@@ -2185,8 +2187,6 @@ def parse_single_merge_record(
21852187
MergeSuggestion if successfully parsed and meets threshold, None otherwise
21862188
"""
21872189
try:
2188-
# Import required constants and types
2189-
from .prompt import GRAPH_FIELD_SEP, PROMPTS
21902190
from .types import GraphNodeData
21912191

21922192
# Extract content between quotes and parentheses
@@ -2195,7 +2195,7 @@ def parse_single_merge_record(
21952195
content = content[:-1]
21962196

21972197
# Parse the content using tuple delimiter
2198-
parts = content.split(PROMPTS["DEFAULT_TUPLE_DELIMITER"])
2198+
parts = content.split(DEFAULT_TUPLE_DELIMITER)
21992199

22002200
# Filter out empty parts (especially the first one if content starts with delimiter)
22012201
parts = [part.strip() for part in parts if part.strip()]

aperag/graph/lightrag/prompt.py

Lines changed: 18 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,10 @@
3636
from typing import Any
3737

3838
GRAPH_FIELD_SEP = "<SEP>"
39-
40-
PROMPTS: dict[str, Any] = {}
41-
42-
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
43-
PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
44-
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
45-
46-
PROMPTS["DEFAULT_ENTITY_TYPES"] = [
39+
DEFAULT_TUPLE_DELIMITER = "<|>"
40+
DEFAULT_RECORD_DELIMITER = "##"
41+
DEFAULT_COMPLETION_DELIMITER = "<|COMPLETE|>"
42+
DEFAULT_ENTITY_TYPES = [
4743
"organization",
4844
"person",
4945
"geo",
@@ -54,13 +50,16 @@
5450
"category",
5551
]
5652

53+
PROMPTS: dict[str, Any] = {}
54+
55+
# Keys: language, entity_types, tuple_delimiter, record_delimiter, completion_delimiter, examples, input_text
5756
PROMPTS["entity_extraction"] = """---Goal---
5857
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
5958
Use {language} as output language.
6059
6160
---Steps---
6261
1. Identify all entities. For each identified entity, extract the following information:
63-
- entity_name: Full Name of the entity, must use **same language** as input text, it's important. If English, capitalized the name.
62+
- entity_name: Full Name of the entity, must use **same language** as Real Data Text, it's important. If English, capitalized the name.
6463
- entity_type: One of the following types: [{entity_types}]
6564
- entity_description: Comprehensive description of the entity's attributes and activities
6665
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
@@ -95,6 +94,7 @@
9594
######################
9695
Output:"""
9796

97+
# Keys: tuple_delimiter, record_delimiter, completion_delimiter (rendered into entity_extraction via {examples})
9898
PROMPTS["entity_extraction_examples"] = [
9999
"""Example 1:
100100
@@ -211,6 +211,7 @@
211211
#############################""",
212212
]
213213

214+
# Keys: language, entity_name, description_list
214215
PROMPTS[
215216
"summarize_entity_descriptions"
216217
] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
@@ -228,13 +229,14 @@
228229
Output:
229230
"""
230231

232+
# Keys: language, entity_types, tuple_delimiter, record_delimiter, completion_delimiter
231233
PROMPTS["entity_continue_extraction"] = """
232234
MANY entities and relationships were missed in the last extraction.
233235
234236
---Remember Steps---
235237
236238
1. Identify all entities. For each identified entity, extract the following information:
237-
- entity_name: Name of the entity, use same language as input text. If English, capitalized the name.
239+
- entity_name: Name of the entity, use same language as Real Data Text. If English, capitalized the name.
238240
- entity_type: One of the following types: [{entity_types}]
239241
- entity_description: Comprehensive description of the entity's attributes and activities
240242
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
@@ -260,6 +262,7 @@
260262
Add them below using the same format:\n
261263
""".strip()
262264

265+
# Keys: (none)
263266
PROMPTS["entity_if_loop_extraction"] = """
264267
---Goal---'
265268
@@ -270,42 +273,10 @@
270273
Answer ONLY by `YES` OR `NO` if there are still entities that need to be added.
271274
""".strip()
272275

276+
# Keys: (none)
273277
PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question.[no-context]"
274278

275-
PROMPTS["rag_response"] = """---Role---
276-
277-
You are a helpful assistant responding to user query about Knowledge Graph and Document Chunks provided in JSON format below.
278-
279-
280-
---Goal---
281-
282-
Generate a concise response based on Knowledge Base and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Knowledge Base, and incorporating general knowledge relevant to the Knowledge Base. Do not include information not provided by Knowledge Base.
283-
284-
When handling relationships with timestamps:
285-
1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
286-
2. When encountering conflicting relationships, consider both the semantic content and the timestamp
287-
3. Don't automatically prefer the most recently created relationships - use judgment based on the context
288-
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
289-
290-
---Conversation History---
291-
{history}
292-
293-
---Knowledge Graph and Document Chunks---
294-
{context_data}
295-
296-
---Response Rules---
297-
298-
- Target format and length: {response_type}
299-
- Use markdown formatting with appropriate section headings
300-
- Please respond in the same language as the user's question.
301-
- Ensure the response maintains continuity with the conversation history.
302-
- List up to 5 most important reference sources at the end under "References" section. Clearly indicating whether each source is from Knowledge Graph (KG) or Document Chunks (DC), and include the file path if available, in the following format: [KG/DC] file_path
303-
- If you don't know the answer, just say so.
304-
- Do not make anything up. Do not include information not provided by the Knowledge Base.
305-
- Addtional user prompt: {user_prompt}
306-
307-
Response:"""
308-
279+
# Keys: examples, history, query
309280
PROMPTS["keywords_extraction"] = """---Role---
310281
311282
You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query and conversation history.
@@ -335,11 +306,12 @@
335306
336307
Current Query: {query}
337308
######################
338-
The `Output` should be human text, not unicode characters. Keep the same language as `Query`.
309+
The `Output` should be human text, not unicode characters. Keep the same language as `Current Query`.
339310
Output:
340311
341312
"""
342313

314+
# Keys: (none, static examples rendered into keywords_extraction via {examples})
343315
PROMPTS["keywords_extraction_examples"] = [
344316
"""Example 1:
345317
@@ -373,39 +345,7 @@
373345
#############################""",
374346
]
375347

376-
PROMPTS["naive_rag_response"] = """---Role---
377-
378-
You are a helpful assistant responding to user query about Document Chunks provided provided in JSON format below.
379-
380-
---Goal---
381-
382-
Generate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks.
383-
384-
When handling content with timestamps:
385-
1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
386-
2. When encountering conflicting information, consider both the content and the timestamp
387-
3. Don't automatically prefer the most recent content - use judgment based on the context
388-
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
389-
390-
---Conversation History---
391-
{history}
392-
393-
---Document Chunks(DC)---
394-
{content_data}
395-
396-
---Response Rules---
397-
398-
- Target format and length: {response_type}
399-
- Use markdown formatting with appropriate section headings
400-
- Please respond in the same language as the user's question.
401-
- Ensure the response maintains continuity with the conversation history.
402-
- List up to 5 most important reference sources at the end under "References" section. Clearly indicating each source from Document Chunks(DC), and include the file path if available, in the following format: [DC] file_path
403-
- If you don't know the answer, just say so.
404-
- Do not include information not provided by the Document Chunks.
405-
- Addtional user prompt: {user_prompt}
406-
407-
Response:"""
408-
348+
# Keys: tuple_delimiter, record_delimiter, completion_delimiter, graph_field_sep, entities_list
409349
PROMPTS["batch_merge_analysis"] = """---Goal---
410350
Given a list of entities from a knowledge graph, identify groups of entities that should be merged because they refer to the EXACT SAME real-world object/individual/specific instance.
411351

aperag/graph/lightrag_manager.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from aperag.db.models import Collection
2323
from aperag.db.ops import db_ops
2424
from aperag.graph.lightrag import LightRAG
25-
from aperag.graph.lightrag.prompt import PROMPTS
25+
from aperag.graph.lightrag.prompt import DEFAULT_ENTITY_TYPES
2626
from aperag.graph.lightrag.utils import EmbeddingFunc
2727
from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync
2828
from aperag.llm.llm_error_types import (
@@ -47,7 +47,7 @@ class LightRAGConfig:
4747
SUMMARY_TO_MAX_TOKENS = 2000
4848
FORCE_LLM_SUMMARY_ON_MERGE = 10
4949
EMBEDDING_MAX_TOKEN_SIZE = 8192
50-
DEFAULT_LANGUAGE = "simplified chinese"
50+
DEFAULT_LANGUAGE = "zh-CN"
5151

5252

5353
class LightRAGError(Exception):
@@ -82,7 +82,7 @@ async def create_lightrag_instance(collection: Collection) -> LightRAG:
8282
config = parseCollectionConfig(collection.config)
8383
kg_config = config.knowledge_graph_config
8484
language = LightRAGConfig.DEFAULT_LANGUAGE
85-
entity_types = PROMPTS["DEFAULT_ENTITY_TYPES"]
85+
entity_types = DEFAULT_ENTITY_TYPES
8686

8787
# Use collection-level language if available
8888
if config.language:

0 commit comments

Comments
 (0)