fix(builder): handle both dict and string formats for core_entities in knowledge unit extractor

unidel2035 · claude · unidel2035 · commit b3a8222e6051 · 2025-11-01T16:19:04.000Z
Fix the AttributeError raised when the LLM returns core_entities as a dict instead of a string.

The code now handles both formats:
- String format (Chinese example): "entity1,entity2,entity3" — each entity is given the type "Others"
- Dict format (English example): {"entity1": "Type1", "entity2": "Type2"} — used as-is
Any other type is logged as a warning and produces no entities.

This resolves the issue where knowledge extraction would fail with:
AttributeError: 'dict' object has no attribute 'split'

Fixes #714

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/experiments/test_core_entities_handling.py b/experiments/test_core_entities_handling.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the core_entities handling issue
+"""
+
+# Simulate the scenario where core_entities can be either string or dict
+
+def test_string_format():
+    """Run the pre-fix split(",") parsing on a comma-separated string and print each entity."""
+    knowledge_value = {
+        "content": "test content",
+        "knowledgetype": "事实性知识",
+        "core_entities": "火电发电量,同比增长率,2019年"  # comma-separated string of entity names
+    }
+
+    print("Testing STRING format:")
+    print(f"  core_entities type: {type(knowledge_value.get('core_entities'))}")
+    print(f"  core_entities value: {knowledge_value.get('core_entities')}")
+
+    try:
+        # Pre-fix logic: call .split(",") directly, which only works when the value is a str
+        for item in knowledge_value.get("core_entities", "").split(","):
+            if not item.strip():  # skip empty or whitespace-only segments
+                continue
+            print(f"  - Entity: {item.strip()}")
+        print("  ✓ SUCCESS: String format works")
+    except AttributeError as e:  # raised when the value has no .split attribute
+        print(f"  ✗ FAILED: {e}")
+    print()
+
+def test_dict_format():
+    """Run the pre-fix split(",") parsing on a dict value to reproduce the AttributeError."""
+    knowledge_value = {
+        "content": "test content",
+        "knowledgetype": "Factual Knowledge",
+        "core_entities": {  # mapping of entity name -> entity type
+            "T.I.": "Person",
+            "No Mediocre": "Culture and Entertainment",
+            "Paperwork": "Culture and Entertainment"
+        }
+    }
+
+    print("Testing DICT format (THIS WILL FAIL WITH CURRENT CODE):")
+    print(f"  core_entities type: {type(knowledge_value.get('core_entities'))}")
+    print(f"  core_entities value: {knowledge_value.get('core_entities')}")
+
+    try:
+        # Pre-fix logic: a dict has no .split(), so AttributeError is raised on the next line
+        for item in knowledge_value.get("core_entities", "").split(","):
+            if not item.strip():
+                continue
+            print(f"  - Entity: {item.strip()}")
+        print("  ✓ SUCCESS: Dict format works")  # not reached for the dict input above
+    except AttributeError as e:  # the expected outcome of this demonstration
+        print(f"  ✗ FAILED: {e}")
+    print()
+
+def test_fixed_approach():
+    """Parse core_entities from str, dict, empty and missing inputs via isinstance checks; print results."""
+    test_cases = [
+        {
+            "name": "String format",
+            "knowledge_value": {
+                "core_entities": "火电发电量,同比增长率,2019年"
+            }
+        },
+        {
+            "name": "Dict format",
+            "knowledge_value": {
+                "core_entities": {
+                    "T.I.": "Person",
+                    "No Mediocre": "Culture and Entertainment"
+                }
+            }
+        },
+        {
+            "name": "Empty string",
+            "knowledge_value": {
+                "core_entities": ""
+            }
+        },
+        {
+            "name": "Missing field",
+            "knowledge_value": {}
+        }
+    ]
+
+    print("Testing FIXED approach that handles both formats:")
+    for test_case in test_cases:
+        print(f"\n  Test: {test_case['name']}")
+        knowledge_value = test_case['knowledge_value']
+        core_entities = {}  # fresh result per case; rebound to the input dict in the dict branch
+
+        try:
+            core_entities_raw = knowledge_value.get("core_entities", "")  # missing key -> ""
+
+            # Branch on the runtime type; there is no else, so other types leave the result empty
+            if isinstance(core_entities_raw, dict):
+                # Dict format: reuse the same dict object (no copy, no validation of its values)
+                core_entities = core_entities_raw
+            elif isinstance(core_entities_raw, str):
+                # String format: split on commas and give every non-empty name the type "Others"
+                for item in core_entities_raw.split(","):
+                    if item.strip():
+                        core_entities[item.strip()] = "Others"
+
+            print(f"    Parsed entities: {core_entities}")
+            print(f"    ✓ SUCCESS")
+        except Exception as e:  # report any error for this case and continue with the next one
+            print(f"    ✗ FAILED: {e}")
+
+if __name__ == "__main__":  # run the three demonstrations in order when executed as a script
+    test_string_format()
+    test_dict_format()
+    test_fixed_approach()
diff --git a/experiments/test_fix.py b/experiments/test_fix.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the fix works correctly
+"""
+import sys
+import logging
+
+# Show WARNING and above so the unexpected-type warning from simulate_fixed_code is visible
+logging.basicConfig(level=logging.WARNING)
+logger = logging.getLogger(__name__)  # module-level logger used by simulate_fixed_code
+
+def simulate_fixed_code(knowledge_value):
+    """Return an {entity_name: entity_type} dict built from knowledge_value["core_entities"]."""
+    core_entities = {}
+    core_entities_raw = knowledge_value.get("core_entities", "")  # missing key is treated as ""
+
+    # Accept either a dict or a comma-separated string; anything else is logged and ignored
+    if isinstance(core_entities_raw, dict):
+        # Dict format {entity_name: entity_type}: the input dict itself is returned, not a copy
+        core_entities = core_entities_raw
+    elif isinstance(core_entities_raw, str):
+        # String format: split on commas, strip whitespace, drop empty segments
+        for item in core_entities_raw.split(","):
+            if not item.strip():
+                continue
+            core_entities[item.strip()] = "Others"  # the string format carries no type information
+    else:
+        # Any other type (e.g. int or list) yields an empty result; only a warning is logged
+        logger.warning(
+            f"Unexpected type for core_entities: {type(core_entities_raw)}, "
+            f"expected str or dict. Value: {core_entities_raw}"
+        )
+
+    return core_entities
+
+def test_all_scenarios():
+    """Run simulate_fixed_code on every table case, print a report, and return 0 if all pass else 1."""
+    test_cases = [
+        {
+            "name": "Chinese format (string)",
+            "knowledge_value": {
+                "content": "2019年全国火电发电量51654亿千瓦时",
+                "knowledgetype": "事实性知识",
+                "core_entities": "火电发电量,同比增长率,2019年"
+            },
+            "expected": {
+                "火电发电量": "Others",
+                "同比增长率": "Others",
+                "2019年": "Others"
+            }
+        },
+        {
+            "name": "English format (dict)",
+            "knowledge_value": {
+                "content": "No Mediocre is a song by T.I.",
+                "knowledgetype": "Factual Knowledge",
+                "core_entities": {
+                    "T.I.": "Person",
+                    "No Mediocre": "Culture and Entertainment",
+                    "Paperwork": "Culture and Entertainment",
+                    "DJ Mustard": "Person"
+                }
+            },
+            "expected": {
+                "T.I.": "Person",
+                "No Mediocre": "Culture and Entertainment",
+                "Paperwork": "Culture and Entertainment",
+                "DJ Mustard": "Person"
+            }
+        },
+        {
+            "name": "Empty string",
+            "knowledge_value": {
+                "core_entities": ""
+            },
+            "expected": {}
+        },
+        {
+            "name": "Missing field",
+            "knowledge_value": {},
+            "expected": {}
+        },
+        {
+            "name": "String with extra spaces",
+            "knowledge_value": {
+                "core_entities": " entity1 , entity2  ,  entity3  "
+            },
+            "expected": {
+                "entity1": "Others",
+                "entity2": "Others",
+                "entity3": "Others"
+            }
+        },
+        {
+            "name": "Invalid type (should log warning)",
+            "knowledge_value": {
+                "core_entities": 123
+            },
+            "expected": {}  # only the empty result is compared; the warning is not asserted
+        },
+        {
+            "name": "List type (should log warning)",
+            "knowledge_value": {
+                "core_entities": ["entity1", "entity2"]
+            },
+            "expected": {}  # a list is neither str nor dict, so its entries are discarded
+        }
+    ]
+
+    all_passed = True
+    for i, test_case in enumerate(test_cases, 1):  # 1-based index for the printed report
+        try:
+            result = simulate_fixed_code(test_case["knowledge_value"])
+            expected = test_case["expected"]
+
+            if result == expected:
+                print(f"✓ Test {i}: {test_case['name']} - PASSED")
+            else:
+                print(f"✗ Test {i}: {test_case['name']} - FAILED")
+                print(f"  Expected: {expected}")
+                print(f"  Got:      {result}")
+                all_passed = False
+        except Exception as e:  # an exception marks the case as failed without aborting the run
+            print(f"✗ Test {i}: {test_case['name']} - EXCEPTION: {e}")
+            all_passed = False
+
+    print()
+    if all_passed:
+        print("🎉 All tests passed!")
+        return 0  # success status, handed to sys.exit by the __main__ guard
+    else:
+        print("❌ Some tests failed!")
+        return 1  # failure status, handed to sys.exit by the __main__ guard
+
+if __name__ == "__main__":  # script entry point
+    sys.exit(test_all_scenarios())  # process exit status is 0 when all cases pass, 1 otherwise
diff --git a/kag/builder/component/extractor/knowledge_unit_extractor.py b/kag/builder/component/extractor/knowledge_unit_extractor.py
@@ -584,10 +584,24 @@ def triple_to_knowledge_unit(triple):
                 {"name": knowledge_id, "category": "KnowledgeUnit"}
             )
             core_entities = {}
-            for item in knowledge_value.get("core_entities", "").split(","):
-                if not item.strip():
-                    continue
-                core_entities[item.strip()] = "Others"
+            core_entities_raw = knowledge_value.get("core_entities", "")
+
+            # Handle both string and dict formats for core_entities
+            if isinstance(core_entities_raw, dict):
+                # Dict format: {entity_name: entity_type}
+                core_entities = core_entities_raw
+            elif isinstance(core_entities_raw, str):
+                # String format: comma-separated values
+                for item in core_entities_raw.split(","):
+                    if not item.strip():
+                        continue
+                    core_entities[item.strip()] = "Others"
+            else:
+                # Handle unexpected types gracefully
+                logger.warning(
+                    f"Unexpected type for core_entities: {type(core_entities_raw)}, "
+                    f"expected str or dict. Value: {core_entities_raw}"
+                )
 
             for core_entity, ent_type in core_entities.items():
                 if core_entity == "":
diff --git a/tests/unit/builder/component/test_knowledge_unit_core_entities.py b/tests/unit/builder/component/test_knowledge_unit_core_entities.py