Skip to content

Commit b3a8222

Browse files
unidel2035 and claude committed
fix(builder): handle both dict and string formats for core_entities in knowledge unit extractor
Fix the AttributeError raised when the LLM returns core_entities as a dict instead of a string. The code now handles both formats:

- String format (Chinese): "entity1,entity2,entity3"
- Dict format (English): {"entity1": "Type1", "entity2": "Type2"}

This resolves the issue where knowledge extraction would fail with:

AttributeError: 'dict' object has no attribute 'split'

Fixes #714

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent acb7361 commit b3a8222

4 files changed

Lines changed: 454 additions & 4 deletions

File tree

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Test script to verify the core_entities handling issue
4+
"""
5+
6+
# Simulate the scenario where core_entities can be either string or dict
7+
8+
def test_string_format():
    """Run the legacy comma-split parsing on a string-valued ``core_entities``.

    Prints the type and value of the field, then each non-blank entity
    obtained by splitting on ``","``. An ``AttributeError`` raised by the
    split is caught and reported instead of propagating.
    """
    knowledge_value = {
        "content": "test content",
        "knowledgetype": "事实性知识",
        "core_entities": "火电发电量,同比增长率,2019年",  # String format
    }
    raw = knowledge_value.get("core_entities")

    print("Testing STRING format:")
    print(f" core_entities type: {type(raw)}")
    print(f" core_entities value: {raw}")

    try:
        # Same expression the pre-fix extractor used.
        for piece in knowledge_value.get("core_entities", "").split(","):
            entity = piece.strip()
            if entity:
                print(f" - Entity: {entity}")
        print(" ✓ SUCCESS: String format works")
    except AttributeError as e:
        print(f" ✗ FAILED: {e}")
    print()
30+
31+
def test_dict_format():
    """Run the legacy comma-split parsing on a dict-valued ``core_entities``.

    The dict has no ``split`` method, so the legacy expression raises
    ``AttributeError``; the error is caught and printed to demonstrate the
    failure rather than aborting the script.
    """
    knowledge_value = {
        "content": "test content",
        "knowledgetype": "Factual Knowledge",
        "core_entities": {  # Dict format
            "T.I.": "Person",
            "No Mediocre": "Culture and Entertainment",
            "Paperwork": "Culture and Entertainment",
        },
    }
    raw = knowledge_value.get("core_entities")

    print("Testing DICT format (THIS WILL FAIL WITH CURRENT CODE):")
    print(f" core_entities type: {type(raw)}")
    print(f" core_entities value: {raw}")

    try:
        # Same expression the pre-fix extractor used; fails on a dict.
        for piece in knowledge_value.get("core_entities", "").split(","):
            entity = piece.strip()
            if entity:
                print(f" - Entity: {entity}")
        print(" ✓ SUCCESS: Dict format works")
    except AttributeError as e:
        print(f" ✗ FAILED: {e}")
    print()
57+
58+
def test_fixed_approach():
    """Show the type-aware parsing on string, dict, empty and missing input.

    For each scenario the parsed ``{entity_name: entity_type}`` mapping is
    printed. A dict value is used as-is; a string value is split on ``","``
    and every non-blank, stripped item is typed ``"Others"``. Any exception
    raised while parsing is caught and reported per scenario.
    """
    test_cases = [
        {
            "name": "String format",
            "knowledge_value": {
                "core_entities": "火电发电量,同比增长率,2019年",
            },
        },
        {
            "name": "Dict format",
            "knowledge_value": {
                "core_entities": {
                    "T.I.": "Person",
                    "No Mediocre": "Culture and Entertainment",
                },
            },
        },
        {
            "name": "Empty string",
            "knowledge_value": {
                "core_entities": "",
            },
        },
        {
            "name": "Missing field",
            "knowledge_value": {},
        },
    ]

    print("Testing FIXED approach that handles both formats:")
    for scenario in test_cases:
        label = scenario['name']
        print(f"\n Test: {label}")
        knowledge_value = scenario['knowledge_value']
        core_entities = {}

        try:
            raw = knowledge_value.get("core_entities", "")

            if isinstance(raw, dict):
                # Already a name -> type mapping.
                core_entities = raw
            elif isinstance(raw, str):
                # Comma-separated names; blanks are dropped.
                stripped = (piece.strip() for piece in raw.split(","))
                core_entities.update(
                    {entity: "Others" for entity in stripped if entity}
                )

            print(f" Parsed entities: {core_entities}")
            print(" ✓ SUCCESS")
        except Exception as e:
            print(f" ✗ FAILED: {e}")
111+
112+
if __name__ == "__main__":
    # Run the two legacy-parsing demonstrations first, then the fixed approach.
    for demo in (test_string_format, test_dict_format, test_fixed_approach):
        demo()

experiments/test_fix.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Test script to verify the fix works correctly
4+
"""
5+
import sys
6+
import logging
7+
8+
# Set up logging
9+
logging.basicConfig(level=logging.WARNING)
10+
logger = logging.getLogger(__name__)
11+
12+
def simulate_fixed_code(knowledge_value):
    """Simulate the fixed ``core_entities`` parsing for one knowledge unit.

    Args:
        knowledge_value: Mapping describing a knowledge unit. Its optional
            ``"core_entities"`` entry may be a comma-separated string of
            entity names or a ``{entity_name: entity_type}`` dict.

    Returns:
        A new dict mapping entity names to entity types. For string input,
        each item is stripped, blank items are skipped, and every name is
        typed ``"Others"``. For dict input, a shallow copy is returned. For
        any other type, a warning is logged and an empty dict is returned.
    """
    core_entities_raw = knowledge_value.get("core_entities", "")

    if isinstance(core_entities_raw, dict):
        # Dict format: {entity_name: entity_type}. Copy it so that a caller
        # mutating the result never alters the input mapping.
        return dict(core_entities_raw)

    if isinstance(core_entities_raw, str):
        # String format: comma-separated values.
        names = (item.strip() for item in core_entities_raw.split(","))
        return {name: "Others" for name in names if name}

    # Handle unexpected types gracefully: report and fall back to no entities.
    logger.warning(
        "Unexpected type for core_entities: %s, expected str or dict. Value: %s",
        type(core_entities_raw),
        core_entities_raw,
    )
    return {}
35+
36+
def test_all_scenarios():
    """Run ``simulate_fixed_code`` over a table of inputs and report results.

    Each scenario's result is compared with its expected dict and a
    PASSED / FAILED / EXCEPTION line is printed for it.

    Returns:
        ``0`` when every scenario matched its expectation, ``1`` otherwise.
    """
    test_cases = [
        {
            "name": "Chinese format (string)",
            "knowledge_value": {
                "content": "2019年全国火电发电量51654亿千瓦时",
                "knowledgetype": "事实性知识",
                "core_entities": "火电发电量,同比增长率,2019年",
            },
            "expected": {
                "火电发电量": "Others",
                "同比增长率": "Others",
                "2019年": "Others",
            },
        },
        {
            "name": "English format (dict)",
            "knowledge_value": {
                "content": "No Mediocre is a song by T.I.",
                "knowledgetype": "Factual Knowledge",
                "core_entities": {
                    "T.I.": "Person",
                    "No Mediocre": "Culture and Entertainment",
                    "Paperwork": "Culture and Entertainment",
                    "DJ Mustard": "Person",
                },
            },
            "expected": {
                "T.I.": "Person",
                "No Mediocre": "Culture and Entertainment",
                "Paperwork": "Culture and Entertainment",
                "DJ Mustard": "Person",
            },
        },
        {
            "name": "Empty string",
            "knowledge_value": {
                "core_entities": "",
            },
            "expected": {},
        },
        {
            "name": "Missing field",
            "knowledge_value": {},
            "expected": {},
        },
        {
            "name": "String with extra spaces",
            "knowledge_value": {
                "core_entities": " entity1 , entity2 , entity3 ",
            },
            "expected": {
                "entity1": "Others",
                "entity2": "Others",
                "entity3": "Others",
            },
        },
        {
            "name": "Invalid type (should log warning)",
            "knowledge_value": {
                "core_entities": 123,
            },
            "expected": {},
        },
        {
            "name": "List type (should log warning)",
            "knowledge_value": {
                "core_entities": ["entity1", "entity2"],
            },
            "expected": {},
        },
    ]

    failures = 0
    for index, case in enumerate(test_cases, 1):
        label = case['name']
        try:
            actual = simulate_fixed_code(case["knowledge_value"])
            wanted = case["expected"]

            if actual == wanted:
                print(f"✓ Test {index}: {label} - PASSED")
                continue

            print(f"✗ Test {index}: {label} - FAILED")
            print(f" Expected: {wanted}")
            print(f" Got: {actual}")
            failures += 1
        except Exception as e:
            print(f"✗ Test {index}: {label} - EXCEPTION: {e}")
            failures += 1

    print()
    if failures:
        print("❌ Some tests failed!")
        return 1

    print("🎉 All tests passed!")
    return 0
134+
135+
if __name__ == "__main__":
    # Use the 0/1 result of the scenario run as the process exit status.
    exit_status = test_all_scenarios()
    sys.exit(exit_status)

kag/builder/component/extractor/knowledge_unit_extractor.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -584,10 +584,24 @@ def triple_to_knowledge_unit(triple):
584584
{"name": knowledge_id, "category": "KnowledgeUnit"}
585585
)
586586
core_entities = {}
587-
for item in knowledge_value.get("core_entities", "").split(","):
588-
if not item.strip():
589-
continue
590-
core_entities[item.strip()] = "Others"
587+
core_entities_raw = knowledge_value.get("core_entities", "")
588+
589+
# Handle both string and dict formats for core_entities
590+
if isinstance(core_entities_raw, dict):
591+
# Dict format: {entity_name: entity_type}
592+
core_entities = core_entities_raw
593+
elif isinstance(core_entities_raw, str):
594+
# String format: comma-separated values
595+
for item in core_entities_raw.split(","):
596+
if not item.strip():
597+
continue
598+
core_entities[item.strip()] = "Others"
599+
else:
600+
# Handle unexpected types gracefully
601+
logger.warning(
602+
f"Unexpected type for core_entities: {type(core_entities_raw)}, "
603+
f"expected str or dict. Value: {core_entities_raw}"
604+
)
591605

592606
for core_entity, ent_type in core_entities.items():
593607
if core_entity == "":

0 commit comments

Comments
 (0)