Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions aperag/graph/lightrag/lightrag.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
extract_entities,
merge_nodes_and_edges,
)
from .prompt import GRAPH_FIELD_SEP, PROMPTS
from .prompt import DEFAULT_ENTITY_TYPES, GRAPH_FIELD_SEP
from .types import KnowledgeGraph
from .utils import (
EmbeddingFunc,
Expand Down Expand Up @@ -228,7 +228,7 @@ class LightRAG:
language: str = field(default="English")
"""Language for entity extraction and query responses."""

entity_types: list[str] = field(default_factory=lambda: PROMPTS["DEFAULT_ENTITY_TYPES"])
entity_types: list[str] = field(default_factory=lambda: DEFAULT_ENTITY_TYPES)
"""List of entity types to extract during graph indexing."""

example_number: int | None = field(default=None)
Expand Down
38 changes: 19 additions & 19 deletions aperag/graph/lightrag/operate.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@
QueryParam,
TextChunkSchema,
)
from .prompt import GRAPH_FIELD_SEP, PROMPTS
from .prompt import (
DEFAULT_COMPLETION_DELIMITER,
DEFAULT_RECORD_DELIMITER,
DEFAULT_TUPLE_DELIMITER,
GRAPH_FIELD_SEP,
PROMPTS,
)
from .types import GraphNodeData, GraphNodeDataDict, MergeSuggestion
from .utils import (
LightRAGLogger,
Expand Down Expand Up @@ -654,9 +660,9 @@ async def extract_entities(
examples = "\n".join(PROMPTS["entity_extraction_examples"])

example_context_base = dict(
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
record_delimiter=DEFAULT_RECORD_DELIMITER,
completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
entity_types=", ".join(entity_types),
language=language,
)
Expand All @@ -665,9 +671,9 @@ async def extract_entities(

entity_extract_prompt = PROMPTS["entity_extraction"]
context_base = dict(
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
record_delimiter=DEFAULT_RECORD_DELIMITER,
completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
entity_types=",".join(entity_types),
examples=examples,
language=language,
Expand Down Expand Up @@ -2072,13 +2078,11 @@ async def _batch_analyze_entities_with_llm(
entities_text += f"- Degree: {entity.degree or 0}\n\n"

# Use prompt from prompts.py
from .prompt import PROMPTS

prompt = PROMPTS["batch_merge_analysis"].format(
entities_list=entities_text,
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
tuple_delimiter=DEFAULT_TUPLE_DELIMITER,
record_delimiter=DEFAULT_RECORD_DELIMITER,
completion_delimiter=DEFAULT_COMPLETION_DELIMITER,
graph_field_sep=GRAPH_FIELD_SEP,
)

Expand Down Expand Up @@ -2132,9 +2136,7 @@ def parse_llm_merge_response(
entity_lookup = {(entity.entity_name or entity.entity_id): entity for entity in entities_list}

# Split by record delimiter
from .prompt import PROMPTS

records = llm_response.split(PROMPTS["DEFAULT_RECORD_DELIMITER"])
records = llm_response.split(DEFAULT_RECORD_DELIMITER)

if lightrag_logger:
lightrag_logger.debug(f"Parsing LLM response: found {len(records)} potential records")
Expand All @@ -2144,7 +2146,7 @@ def parse_llm_merge_response(

for i, record in enumerate(records):
record = record.strip()
if not record or PROMPTS["DEFAULT_COMPLETION_DELIMITER"] in record:
if not record or DEFAULT_COMPLETION_DELIMITER in record:
continue

suggestion = parse_single_merge_record(record, entity_lookup, confidence_threshold, lightrag_logger)
Expand Down Expand Up @@ -2185,8 +2187,6 @@ def parse_single_merge_record(
MergeSuggestion if successfully parsed and meets threshold, None otherwise
"""
try:
# Import required constants and types
from .prompt import GRAPH_FIELD_SEP, PROMPTS
from .types import GraphNodeData

# Extract content between quotes and parentheses
Expand All @@ -2195,7 +2195,7 @@ def parse_single_merge_record(
content = content[:-1]

# Parse the content using tuple delimiter
parts = content.split(PROMPTS["DEFAULT_TUPLE_DELIMITER"])
parts = content.split(DEFAULT_TUPLE_DELIMITER)

# Filter out empty parts (especially the first one if content starts with delimiter)
parts = [part.strip() for part in parts if part.strip()]
Expand Down
96 changes: 18 additions & 78 deletions aperag/graph/lightrag/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,10 @@
from typing import Any

GRAPH_FIELD_SEP = "<SEP>"

PROMPTS: dict[str, Any] = {}

PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"

PROMPTS["DEFAULT_ENTITY_TYPES"] = [
DEFAULT_TUPLE_DELIMITER = "<|>"
DEFAULT_RECORD_DELIMITER = "##"
DEFAULT_COMPLETION_DELIMITER = "<|COMPLETE|>"
DEFAULT_ENTITY_TYPES = [
"organization",
"person",
"geo",
Expand All @@ -54,13 +50,16 @@
"category",
]

PROMPTS: dict[str, Any] = {}

# Keys: language, entity_types, tuple_delimiter, record_delimiter, completion_delimiter, examples, input_text
PROMPTS["entity_extraction"] = """---Goal---
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
Use {language} as output language.

---Steps---
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Full Name of the entity, must use **same language** as input text, it's important. If English, capitalized the name.
- entity_name: Full Name of the entity, must use **same language** as Real Data Text, it's important. If English, capitalized the name.
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
Expand Down Expand Up @@ -95,6 +94,7 @@
######################
Output:"""

# Keys: tuple_delimiter, record_delimiter, completion_delimiter (rendered into entity_extraction via {examples})
PROMPTS["entity_extraction_examples"] = [
"""Example 1:

Expand Down Expand Up @@ -211,6 +211,7 @@
#############################""",
]

# Keys: language, entity_name, description_list
PROMPTS[
"summarize_entity_descriptions"
] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
Expand All @@ -228,13 +229,14 @@
Output:
"""

# Keys: language, entity_types, tuple_delimiter, record_delimiter, completion_delimiter
PROMPTS["entity_continue_extraction"] = """
MANY entities and relationships were missed in the last extraction.

---Remember Steps---

1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, use same language as input text. If English, capitalized the name.
- entity_name: Name of the entity, use same language as Real Data Text. If English, capitalized the name.
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
Expand All @@ -260,6 +262,7 @@
Add them below using the same format:\n
""".strip()

# Keys: (none)
PROMPTS["entity_if_loop_extraction"] = """
---Goal---

Expand All @@ -270,42 +273,10 @@
Answer ONLY by `YES` OR `NO` if there are still entities that need to be added.
""".strip()

# Keys: (none)
PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question.[no-context]"

PROMPTS["rag_response"] = """---Role---

You are a helpful assistant responding to user query about Knowledge Graph and Document Chunks provided in JSON format below.


---Goal---

Generate a concise response based on Knowledge Base and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Knowledge Base, and incorporating general knowledge relevant to the Knowledge Base. Do not include information not provided by Knowledge Base.

When handling relationships with timestamps:
1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
2. When encountering conflicting relationships, consider both the semantic content and the timestamp
3. Don't automatically prefer the most recently created relationships - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps

---Conversation History---
{history}

---Knowledge Graph and Document Chunks---
{context_data}

---Response Rules---

- Target format and length: {response_type}
- Use markdown formatting with appropriate section headings
- Please respond in the same language as the user's question.
- Ensure the response maintains continuity with the conversation history.
- List up to 5 most important reference sources at the end under "References" section. Clearly indicating whether each source is from Knowledge Graph (KG) or Document Chunks (DC), and include the file path if available, in the following format: [KG/DC] file_path
- If you don't know the answer, just say so.
- Do not make anything up. Do not include information not provided by the Knowledge Base.
- Additional user prompt: {user_prompt}

Response:"""

# Keys: examples, history, query
PROMPTS["keywords_extraction"] = """---Role---

You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query and conversation history.
Expand Down Expand Up @@ -335,11 +306,12 @@

Current Query: {query}
######################
The `Output` should be human text, not unicode characters. Keep the same language as `Query`.
The `Output` should be human text, not unicode characters. Keep the same language as `Current Query`.
Output:

"""

# Keys: (none, static examples rendered into keywords_extraction via {examples})
PROMPTS["keywords_extraction_examples"] = [
"""Example 1:

Expand Down Expand Up @@ -373,39 +345,7 @@
#############################""",
]

PROMPTS["naive_rag_response"] = """---Role---

You are a helpful assistant responding to user query about Document Chunks provided in JSON format below.

---Goal---

Generate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks.

When handling content with timestamps:
1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
2. When encountering conflicting information, consider both the content and the timestamp
3. Don't automatically prefer the most recent content - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps

---Conversation History---
{history}

---Document Chunks(DC)---
{content_data}

---Response Rules---

- Target format and length: {response_type}
- Use markdown formatting with appropriate section headings
- Please respond in the same language as the user's question.
- Ensure the response maintains continuity with the conversation history.
- List up to 5 most important reference sources at the end under "References" section. Clearly indicating each source from Document Chunks(DC), and include the file path if available, in the following format: [DC] file_path
- If you don't know the answer, just say so.
- Do not include information not provided by the Document Chunks.
- Additional user prompt: {user_prompt}

Response:"""

# Keys: tuple_delimiter, record_delimiter, completion_delimiter, graph_field_sep, entities_list
PROMPTS["batch_merge_analysis"] = """---Goal---
Given a list of entities from a knowledge graph, identify groups of entities that should be merged because they refer to the EXACT SAME real-world object/individual/specific instance.

Expand Down
6 changes: 3 additions & 3 deletions aperag/graph/lightrag_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from aperag.db.models import Collection
from aperag.db.ops import db_ops
from aperag.graph.lightrag import LightRAG
from aperag.graph.lightrag.prompt import PROMPTS
from aperag.graph.lightrag.prompt import DEFAULT_ENTITY_TYPES
from aperag.graph.lightrag.utils import EmbeddingFunc
from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync
from aperag.llm.llm_error_types import (
Expand All @@ -47,7 +47,7 @@ class LightRAGConfig:
SUMMARY_TO_MAX_TOKENS = 2000
FORCE_LLM_SUMMARY_ON_MERGE = 10
EMBEDDING_MAX_TOKEN_SIZE = 8192
DEFAULT_LANGUAGE = "simplified chinese"
DEFAULT_LANGUAGE = "zh-CN"


class LightRAGError(Exception):
Expand Down Expand Up @@ -82,7 +82,7 @@ async def create_lightrag_instance(collection: Collection) -> LightRAG:
config = parseCollectionConfig(collection.config)
kg_config = config.knowledge_graph_config
language = LightRAGConfig.DEFAULT_LANGUAGE
entity_types = PROMPTS["DEFAULT_ENTITY_TYPES"]
entity_types = DEFAULT_ENTITY_TYPES

# Use collection-level language if available
if config.language:
Expand Down
Loading
Loading