Skip to content

Commit 65fdbfa

Browse files
authored
feat: support_customize_entity_types (#1403)
* feat: support_customize_entity_types * feat: support_customize_entity_types
1 parent d8695be commit 65fdbfa

File tree

10 files changed, with 209 additions and 33 deletions.

aperag/api/components/schemas/collection.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,23 @@ collectionSource:
109109
type: string
110110

111111

112+
knowledgeGraphConfig:
113+
type: object
114+
description: Configuration for knowledge graph generation
115+
properties:
116+
language:
117+
type: string
118+
description: Language for entity extraction and query responses
119+
default: "English"
120+
example: "English"
121+
entity_types:
122+
type: array
123+
items:
124+
type: string
125+
description: List of entity types to extract during graph indexing
126+
default: ["organization", "person", "geo", "event", "product", "technology", "date", "category"]
127+
example: ["organization", "person", "geo", "event"]
128+
112129
collectionConfig:
113130
type: object
114131
properties:
@@ -136,6 +153,12 @@ collectionConfig:
136153
type: boolean
137154
description: Whether to enable vision index
138155
default: false
156+
knowledge_graph_config:
157+
allOf:
158+
- $ref: '#/knowledgeGraphConfig'
159+
default:
160+
language: "English"
161+
entity_types: ["organization", "person", "geo", "event", "product", "technology", "date", "category"]
139162
embedding:
140163
$ref: './model.yaml#/modelSpec'
141164
completion:

aperag/graph/lightrag/lightrag.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,14 @@ class LightRAG:
229229
# Extensions
230230
# ---
231231

232-
addon_params: dict[str, Any] = field(
233-
default_factory=lambda: {"language": get_env_value("SUMMARY_LANGUAGE", "English", str)}
234-
)
232+
language: str = field(default="English")
233+
"""Language for entity extraction and query responses."""
234+
235+
# Hand each instance its own copy of the default entity types: returning the
# PROMPTS entry itself would alias one module-level list across every instance,
# so mutating one instance's entity_types would change the global default.
entity_types: list[str] = field(default_factory=lambda: list(PROMPTS["DEFAULT_ENTITY_TYPES"]))
236+
"""List of entity types to extract during graph indexing."""
237+
238+
example_number: int | None = field(default=None)
239+
"""Number of examples to use in prompts. If None, uses all available examples."""
235240

236241
# Storages Management
237242
# ---
@@ -544,7 +549,7 @@ async def _process_component_with_semaphore(task_data):
544549
tokenizer=self.tokenizer,
545550
llm_model_max_token_size=self.llm_model_max_token_size,
546551
summary_to_max_tokens=self.summary_to_max_tokens,
547-
addon_params=self.addon_params or PROMPTS["DEFAULT_LANGUAGE"],
552+
language=self.language,
548553
force_llm_summary_on_merge=self.force_llm_summary_on_merge,
549554
lightrag_logger=self.lightrag_logger,
550555
)
@@ -759,7 +764,9 @@ async def aprocess_graph_indexing(
759764
chunks,
760765
use_llm_func=self.llm_model_func,
761766
entity_extract_max_gleaning=self.entity_extract_max_gleaning,
762-
addon_params=self.addon_params,
767+
language=self.language,
768+
entity_types=self.entity_types,
769+
example_number=self.example_number,
763770
llm_model_max_async=self.llm_model_max_async,
764771
lightrag_logger=self.lightrag_logger,
765772
)
@@ -809,7 +816,8 @@ async def aquery_context(
809816
param,
810817
self.tokenizer,
811818
self.llm_model_func,
812-
self.addon_params,
819+
language=self.language,
820+
example_number=self.example_number,
813821
chunks_vdb=self.chunks_vdb,
814822
)
815823

@@ -898,7 +906,8 @@ async def aquery(
898906
param,
899907
self.tokenizer,
900908
self.llm_model_func,
901-
self.addon_params,
909+
language=self.language,
910+
example_number=self.example_number,
902911
system_prompt=system_prompt,
903912
chunks_vdb=self.chunks_vdb,
904913
)

aperag/graph/lightrag/operate.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ async def merge_nodes_and_edges(
501501
tokenizer,
502502
llm_model_max_token_size,
503503
summary_to_max_tokens,
504-
addon_params,
504+
language: str,
505505
force_llm_summary_on_merge,
506506
lightrag_logger: LightRAGLogger,
507507
) -> dict[str, int]:
@@ -516,7 +516,7 @@ async def merge_nodes_and_edges(
516516
tokenizer,
517517
llm_model_max_token_size,
518518
summary_to_max_tokens,
519-
addon_params,
519+
language,
520520
force_llm_summary_on_merge,
521521
lightrag_logger,
522522
)
@@ -532,15 +532,12 @@ async def _merge_nodes_and_edges_impl(
532532
tokenizer,
533533
llm_model_max_token_size,
534534
summary_to_max_tokens,
535-
addon_params,
535+
language: str,
536536
force_llm_summary_on_merge,
537537
lightrag_logger: LightRAGLogger,
538538
) -> dict[str, int]:
539539
"""Internal implementation of merge_nodes_and_edges with fine-grained locking"""
540540

541-
# Extract language from addon_params
542-
language = addon_params.get("language", "English")
543-
544541
# Collect all nodes and edges from all chunks
545542
all_nodes = defaultdict(list)
546543
all_edges = defaultdict(list)
@@ -572,7 +569,7 @@ async def _merge_nodes_and_edges_impl(
572569
tokenizer,
573570
llm_model_max_token_size,
574571
summary_to_max_tokens,
575-
language, # Pass language instead of addon_params
572+
language,
576573
force_llm_summary_on_merge,
577574
lightrag_logger,
578575
workspace,
@@ -613,7 +610,7 @@ async def _merge_nodes_and_edges_impl(
613610
tokenizer,
614611
llm_model_max_token_size,
615612
summary_to_max_tokens,
616-
language, # Pass language instead of addon_params
613+
language,
617614
force_llm_summary_on_merge,
618615
lightrag_logger,
619616
workspace,
@@ -644,15 +641,13 @@ async def extract_entities(
644641
chunks: dict[str, TextChunkSchema],
645642
use_llm_func: callable,
646643
entity_extract_max_gleaning: int,
647-
addon_params: dict,
644+
language: str,
645+
entity_types: list[str],
646+
example_number: int | None,
648647
llm_model_max_async: int,
649648
lightrag_logger: LightRAGLogger,
650649
) -> list:
651650
ordered_chunks = list(chunks.items())
652-
# add language and example number params to prompt
653-
language = addon_params.get("language", PROMPTS["DEFAULT_LANGUAGE"])
654-
entity_types = addon_params.get("entity_types", PROMPTS["DEFAULT_ENTITY_TYPES"])
655-
example_number = addon_params.get("example_number", None)
656651
if example_number and example_number < len(PROMPTS["entity_extraction_examples"]):
657652
examples = "\n".join(PROMPTS["entity_extraction_examples"][: int(example_number)])
658653
else:
@@ -824,7 +819,8 @@ async def build_query_context(
824819
query_param: QueryParam,
825820
tokenizer: Tokenizer,
826821
llm_model_func: callable,
827-
addon_params: dict,
822+
language: str,
823+
example_number: int | None,
828824
chunks_vdb: BaseVectorStorage = None,
829825
):
830826
if query_param.model_func:
@@ -833,7 +829,7 @@ async def build_query_context(
833829
use_model_func = llm_model_func
834830

835831
hl_keywords, ll_keywords = await get_keywords_from_query(
836-
query, query_param, tokenizer, use_model_func, addon_params
832+
query, query_param, tokenizer, use_model_func, language, example_number
837833
)
838834

839835
logger.debug(f"High-level keywords: {hl_keywords}")
@@ -882,7 +878,8 @@ async def kg_query(
882878
query_param: QueryParam,
883879
tokenizer: Tokenizer,
884880
llm_model_func: callable,
885-
addon_params: dict,
881+
language: str,
882+
example_number: int | None,
886883
system_prompt: str | None = None,
887884
chunks_vdb: BaseVectorStorage = None,
888885
) -> str | AsyncIterator[str]:
@@ -901,7 +898,8 @@ async def kg_query(
901898
query_param,
902899
tokenizer,
903900
llm_model_func,
904-
addon_params,
901+
language,
902+
example_number,
905903
chunks_vdb,
906904
)
907905

@@ -980,7 +978,8 @@ async def get_keywords_from_query(
980978
query_param: QueryParam,
981979
tokenizer: Tokenizer,
982980
llm_model_func: callable,
983-
addon_params: dict,
981+
language: str,
982+
example_number: int | None,
984983
) -> tuple[list[str], list[str]]:
985984
"""
986985
Retrieves high-level and low-level keywords for RAG operations.
@@ -996,7 +995,9 @@ async def get_keywords_from_query(
996995
return query_param.hl_keywords, query_param.ll_keywords
997996

998997
# Extract keywords using extract_keywords_only function which already supports conversation history
999-
hl_keywords, ll_keywords = await extract_keywords_only(query, query_param, tokenizer, llm_model_func, addon_params)
998+
hl_keywords, ll_keywords = await extract_keywords_only(
999+
query, query_param, tokenizer, llm_model_func, language, example_number
1000+
)
10001001
return hl_keywords, ll_keywords
10011002

10021003

@@ -1005,20 +1006,19 @@ async def extract_keywords_only(
10051006
param: QueryParam,
10061007
tokenizer: Tokenizer,
10071008
llm_model_func: callable,
1008-
addon_params: dict,
1009+
language: str,
1010+
example_number: int | None,
10091011
) -> tuple[list[str], list[str]]:
10101012
"""
10111013
Extract high-level and low-level keywords from the given 'text' using the LLM.
10121014
This method does NOT build the final RAG context or provide a final answer.
10131015
It ONLY extracts keywords (hl_keywords, ll_keywords).
10141016
"""
10151017
# 2. Build the examples
1016-
example_number = addon_params.get("example_number", None)
10171018
if example_number and example_number < len(PROMPTS["keywords_extraction_examples"]):
10181019
examples = "\n".join(PROMPTS["keywords_extraction_examples"][: int(example_number)])
10191020
else:
10201021
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
1021-
language = addon_params.get("language", PROMPTS["DEFAULT_LANGUAGE"])
10221022

10231023
# 3. Process conversation history
10241024
history_context = ""

aperag/graph/lightrag/prompt.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939

4040
PROMPTS: dict[str, Any] = {}
4141

42-
PROMPTS["DEFAULT_LANGUAGE"] = "English"
4342
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
4443
PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
4544
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"

aperag/graph/lightrag_manager.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from aperag.db.models import Collection
2323
from aperag.db.ops import db_ops
2424
from aperag.graph.lightrag import LightRAG
25+
from aperag.graph.lightrag.prompt import PROMPTS
2526
from aperag.graph.lightrag.utils import EmbeddingFunc
2627
from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync
2728
from aperag.llm.llm_error_types import (
@@ -46,7 +47,6 @@ class LightRAGConfig:
4647
SUMMARY_TO_MAX_TOKENS = 2000
4748
FORCE_LLM_SUMMARY_ON_MERGE = 10
4849
EMBEDDING_MAX_TOKEN_SIZE = 8192
49-
# DEFAULT_LANGUAGE = "Simplified Chinese"
5050
DEFAULT_LANGUAGE = "The same language like input text"
5151

5252

@@ -76,6 +76,19 @@ async def create_lightrag_instance(collection: Collection) -> LightRAG:
7676
# Configure storage backends
7777
await _configure_storage_backends(kv_storage, vector_storage, graph_storage)
7878

79+
# Parse knowledge graph config from collection config
80+
from aperag.schema.utils import parseCollectionConfig
81+
82+
config = parseCollectionConfig(collection.config)
83+
kg_config = config.knowledge_graph_config
84+
language = LightRAGConfig.DEFAULT_LANGUAGE
85+
# Copy the default so the LightRAG instance never holds a reference to the
# shared module-level PROMPTS list (a later mutation would leak globally).
entity_types = list(PROMPTS["DEFAULT_ENTITY_TYPES"])
86+
if kg_config:
87+
if kg_config.language:
88+
language = kg_config.language
89+
if kg_config.entity_types:
90+
entity_types = kg_config.entity_types
91+
7992
# Create LightRAG instance
8093
rag = LightRAG(
8194
workspace=collection_id,
@@ -93,7 +106,8 @@ async def create_lightrag_instance(collection: Collection) -> LightRAG:
93106
entity_extract_max_gleaning=LightRAGConfig.ENTITY_EXTRACT_MAX_GLEANING,
94107
summary_to_max_tokens=LightRAGConfig.SUMMARY_TO_MAX_TOKENS,
95108
force_llm_summary_on_merge=LightRAGConfig.FORCE_LLM_SUMMARY_ON_MERGE,
96-
addon_params={"language": LightRAGConfig.DEFAULT_LANGUAGE},
109+
language=language,
110+
entity_types=entity_types,
97111
kv_storage=kv_storage,
98112
vector_storage=vector_storage,
99113
graph_storage=graph_storage,

aperag/schema/view_models.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
# generated by datamodel-codegen:
1616
# filename: openapi.merged.yaml
17-
# timestamp: 2025-11-11T02:52:34+00:00
17+
# timestamp: 2025-11-11T06:17:00+00:00
1818

1919
from __future__ import annotations
2020

@@ -66,6 +66,32 @@ class ModelSpec(BaseModel):
6666
)
6767

6868

69+
class KnowledgeGraphConfig(BaseModel):
70+
"""
71+
Configuration for knowledge graph generation
72+
"""
73+
74+
language: Optional[str] = Field(
75+
'English',
76+
description='Language for entity extraction and query responses',
77+
examples=['English'],
78+
)
79+
entity_types: Optional[list[str]] = Field(
80+
[
81+
'organization',
82+
'person',
83+
'geo',
84+
'event',
85+
'product',
86+
'technology',
87+
'date',
88+
'category',
89+
],
90+
description='List of entity types to extract during graph indexing',
91+
examples=[['organization', 'person', 'geo', 'event']],
92+
)
93+
94+
6995
class CollectionConfig(BaseModel):
7096
source: Optional[str] = Field(
7197
None, description='Source system identifier', examples=['system']
@@ -85,6 +111,23 @@ class CollectionConfig(BaseModel):
85111
enable_vision: Optional[bool] = Field(
86112
False, description='Whether to enable vision index'
87113
)
114+
knowledge_graph_config: Optional[KnowledgeGraphConfig] = Field(
115+
default_factory=lambda: KnowledgeGraphConfig.model_validate(
116+
{
117+
'language': 'English',
118+
'entity_types': [
119+
'organization',
120+
'person',
121+
'geo',
122+
'event',
123+
'product',
124+
'technology',
125+
'date',
126+
'category',
127+
],
128+
}
129+
)
130+
)
88131
embedding: Optional[ModelSpec] = None
89132
completion: Optional[ModelSpec] = None
90133
path: Optional[str] = Field(None, description='Path for local and ftp sources')

web/src/api/models/collection-config.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
*/
1414

1515

16+
// May contain unused imports in some cases
17+
// @ts-ignore
18+
import type { KnowledgeGraphConfig } from './knowledge-graph-config';
1619
// May contain unused imports in some cases
1720
// @ts-ignore
1821
import type { ModelSpec } from './model-spec';
@@ -59,6 +62,12 @@ export interface CollectionConfig {
5962
* @memberof CollectionConfig
6063
*/
6164
'enable_vision'?: boolean;
65+
/**
66+
*
67+
* @type {KnowledgeGraphConfig}
68+
* @memberof CollectionConfig
69+
*/
70+
'knowledge_graph_config'?: KnowledgeGraphConfig;
6271
/**
6372
*
6473
* @type {ModelSpec}

0 commit comments

Comments
 (0)