feat: add kg_accuracy & kg_consistency metrics

ChenZiHong-Gavin · ChenZiHong-Gavin · commit e10b3917ccc0 · 2025-12-26T16:24:38.000+08:00
diff --git a/examples/evaluate/evaluate_kg/kg_evaluation_config.yaml b/examples/evaluate/evaluate_kg/kg_evaluation_config.yaml
@@ -41,5 +41,5 @@ nodes:
     params:
       metrics:
         - kg_structure
-#        - kg_accuracy
-#        - kg_consistency
+        - kg_accuracy
+        - kg_consistency
diff --git a/graphgen/models/evaluator/kg/accuracy_evaluator.py b/graphgen/models/evaluator/kg/accuracy_evaluator.py
@@ -279,8 +279,9 @@ def _evaluate_relation_extraction(
                 "issues": [f"Evaluation error: {str(e)}"],
             }
 
+    @staticmethod
     def _aggregate_evaluation_results(
-        self, entity_evaluations: List[Dict], relation_evaluations: List[Dict]
+        entity_evaluations: List[Dict], relation_evaluations: List[Dict]
     ) -> Dict[str, Any]:
         """Aggregate evaluation results from all chunks."""
 
diff --git a/graphgen/operators/evaluate/evaluate_kg.py b/graphgen/operators/evaluate/evaluate_kg.py
diff --git a/graphgen/operators/evaluate/evaluate_service.py b/graphgen/operators/evaluate/evaluate_service.py
@@ -18,15 +18,19 @@ def __init__(
         working_dir: str = "cache",
         metrics: list[str] = None,
         graph_backend: str = "kuzu",
+        kv_backend: str = "rocksdb",
         **kwargs,
     ):
         super().__init__(working_dir=working_dir, op_name="evaluate_service")
         self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
         self.metrics = metrics or []
         self.kwargs = kwargs
-        self.graph_backend = init_storage(
+        self.graph_storage = init_storage(
             backend=graph_backend, working_dir=working_dir, namespace="graph"
         )
+        self.chunk_storage = init_storage(
+            backend=kv_backend, working_dir=working_dir, namespace="chunk"
+        )
 
         # Initialize evaluators
         self.qa_evaluators = {}
@@ -62,21 +66,23 @@ def _init_evaluators(self):
                 from graphgen.models import AccuracyEvaluator
 
                 self.kg_evaluators[metric] = AccuracyEvaluator(
-                    graph_storage=self.graph_backend,
-                    **self.kwargs.get("accuracy_params", {}),
+                    graph_storage=self.graph_storage,
+                    chunk_storage=self.chunk_storage,
+                    llm_client=self.llm_client,
                 )
             elif metric == "kg_consistency":
                 from graphgen.models import ConsistencyEvaluator
 
                 self.kg_evaluators[metric] = ConsistencyEvaluator(
-                    graph_storage=self.graph_backend,
-                    **self.kwargs.get("consistency_params", {}),
+                    graph_storage=self.graph_storage,
+                    chunk_storage=self.chunk_storage,
+                    llm_client=self.llm_client,
                 )
             elif metric == "kg_structure":
                 from graphgen.models import StructureEvaluator
 
                 self.kg_evaluators[metric] = StructureEvaluator(
-                    graph_storage=self.graph_backend,
+                    graph_storage=self.graph_storage,
                     **self.kwargs.get("structure_params", {}),
                 )
             else:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -279,8 +279,9 @@ def _evaluate_relation_extraction(` |
| `279` | `279` | `"issues": [f"Evaluation error: {str(e)}"],` |
| `280` | `280` | `}` |
| `281` | `281` |  |
|  | `282` | `+ @staticmethod` |
| `282` | `283` | `def _aggregate_evaluation_results(` |
| `283` |  | `- self, entity_evaluations: List[Dict], relation_evaluations: List[Dict]` |
|  | `284` | `+ entity_evaluations: List[Dict], relation_evaluations: List[Dict]` |
| `284` | `285` | `) -> Dict[str, Any]:` |
| `285` | `286` | `"""Aggregate evaluation results from all chunks."""` |
| `286` | `287` |  |