refactor: refactor quiz&judge to ray actors

ChenZiHong-Gavin · ChenZiHong-Gavin · commit c9435d795cf9 · 2025-12-10T15:18:04.000+08:00
diff --git a/graphgen/operators/generate/generate.py b/graphgen/operators/generate/generate.py
diff --git a/graphgen/operators/judge/judge_service.py b/graphgen/operators/judge/judge_service.py
@@ -1,42 +1,76 @@
 import math
 
-import gradio as gr
-
-from graphgen.bases import BaseLLMWrapper
-from graphgen.models import JsonKVStorage, NetworkXStorage
-from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
-from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy
-
-
-import math
-from collections.abc import Iterable
-
 import pandas as pd
 
-from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper
+from graphgen.bases import BaseGraphStorage, BaseLLMWrapper
 from graphgen.common import init_llm, init_storage
-from graphgen.models import NetworkXStorage, JsonKVStorage
 from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
 from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy
 
 
 class JudgeService:
     """Service for judging graph edges and nodes using a trainee LLM."""
+
     def __init__(self, working_dir: str = "cache"):
         self.llm_client: BaseLLMWrapper = init_llm("trainee")
+        self.graph_storage: BaseGraphStorage = init_storage(
+            backend="networkx",
+            working_dir=working_dir,
+            namespace="graph",  # judge() writes each computed loss into this graph
+        )
 
     def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
+        items = batch.to_dict(orient="records")  # one dict per DataFrame row
+        self.graph_storage.reload()  # presumably re-reads persisted graph; verify
+        self.judge(items)  # losses go to graph storage, not into the return value
         return pd.DataFrame([{"status": "judging_completed"}])
 
-    def judge(self) -> Iterable[pd.DataFrame]:
-        """
-        Judge the statements in the graph storage
+    async def _process_single_judge(self, item: dict) -> dict:
+        """Judge item["description"] with the LLM and store the loss in item["loss"]."""
+        description = item["description"]
+        try:
+            judgement = await self.llm_client.generate_topk_per_token(
+                STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(statement=description)
+            )
+            gt = item.get("ground_truth", "yes")
+            loss = yes_no_loss_entropy([judgement[0].top_candidates], [gt])
+            logger.debug("Description: %s Loss: %s", description, loss)
+            item["loss"] = loss
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("Error in judging description: %s", e)
+            item["loss"] = -math.log(0.1)
+            logger.info("Use default loss %s", item["loss"])
+        return item
 
-        :param re_judge: re-judge the relations
-        :return:
+    def judge(self, items: list[dict]) -> None:
+        """
+        Judge every item's description concurrently and save each loss on the
+        graph node or edge named by the item (node_id / edge_source+edge_target).
         """
-        return
+        results = run_concurrent(
+            self._process_single_judge,
+            items,
+            desc="Judging descriptions",
+            unit="description",
+        )
 
+        # Rows built from a DataFrame may hold NaN instead of None for absent ids.
+        for item in results:
+            node_id = item.get("node_id")
+            edge_source = item.get("edge_source")
+            edge_target = item.get("edge_target")
+            loss = item["loss"]
+            if pd.notna(node_id):
+                node_data = self.graph_storage.get_node(node_id)
+                if node_data is not None:
+                    node_data["loss"] = loss
+                    self.graph_storage.update_node(node_id, node_data)
+            elif pd.notna(edge_source) and pd.notna(edge_target):
+                edge_data = self.graph_storage.get_edge(edge_source, edge_target)
+                if edge_data is not None:
+                    edge_data["loss"] = loss
+                    self.graph_storage.update_edge(edge_source, edge_target, edge_data)
+        self.graph_storage.index_done_callback()
 
 
 # async def judge_statement(  # pylint: disable=too-many-statements
diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py
@@ -10,10 +10,8 @@
 )
 from graphgen.utils import logger
 
-from .pre_tokenize import pre_tokenize
 
-
-async def partition_kg(
+def partition_kg(
     kg_instance: BaseGraphStorage,
     chunk_storage: BaseKVStorage,
     tokenizer: Any = BaseTokenizer,
@@ -60,7 +58,7 @@ async def partition_kg(
     return batches
 
 
-async def attach_additional_data_to_node(
+def attach_additional_data_to_node(
     batches: list[
         tuple[
             list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
@@ -112,3 +110,61 @@ async def _attach_by_type(
             # We'll use the first image chunk found for this node.
             node_data["images"] = image_chunks[0]
             logger.debug("Attached image data to node %s", node_id)
+
+
+import asyncio
+from typing import List, Tuple
+
+import gradio as gr
+
+from graphgen.bases import BaseGraphStorage, BaseTokenizer
+from graphgen.utils import run_concurrent
+
+
+async def pre_tokenize(
+    graph_storage: BaseGraphStorage,
+    tokenizer: BaseTokenizer,
+    edges: List[Tuple],
+    nodes: List[Tuple],
+    progress_bar: gr.Progress = None,
+    max_concurrent: int = 1000,
+) -> Tuple[List, List]:
+    """Fill in token "length" for edges/nodes and write them back to graph storage."""
+    sem = asyncio.Semaphore(max_concurrent)
+
+    async def _patch_and_write(obj: Tuple, *, is_node: bool) -> Tuple:
+        async with sem:
+            data = obj[1] if is_node else obj[2]
+            if "length" not in data:
+                # Encode in the default executor so the event loop is not blocked.
+                token_ids = await asyncio.get_running_loop().run_in_executor(
+                    None, tokenizer.encode, data["description"]
+                )
+                data["length"] = len(token_ids)
+            if is_node:
+                graph_storage.update_node(obj[0], obj[1])
+            else:
+                graph_storage.update_edge(obj[0], obj[1], obj[2])
+            return obj
+
+    # NOTE(review): JudgeService calls run_concurrent without await; verify this.
+    new_edges, new_nodes = await asyncio.gather(
+        run_concurrent(
+            lambda e: _patch_and_write(e, is_node=False),
+            edges,
+            desc="Pre-tokenizing edges",
+            unit="edge",
+            progress_bar=progress_bar,
+        ),
+        run_concurrent(
+            lambda n: _patch_and_write(n, is_node=True),
+            nodes,
+            desc="Pre-tokenizing nodes",
+            unit="node",
+            progress_bar=progress_bar,
+        ),
+    )
+
+    graph_storage.index_done_callback()
+    return new_edges, new_nodes
+
diff --git a/graphgen/operators/partition/pre_tokenize.py b/graphgen/operators/partition/pre_tokenize.py
diff --git a/graphgen/operators/quiz/quiz_service.py b/graphgen/operators/quiz/quiz_service.py
@@ -5,11 +5,16 @@
 from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper
 from graphgen.common import init_llm, init_storage
 from graphgen.models import QuizGenerator
-from graphgen.utils import compute_content_hash, run_concurrent, logger
+from graphgen.utils import compute_content_hash, logger, run_concurrent
 
 
 class QuizService:
-    def __init__(self, working_dir: str = "cache", quiz_samples: int = 1, concurrency_limit: int = 200):
+    def __init__(
+        self,
+        working_dir: str = "cache",
+        quiz_samples: int = 1,
+        concurrency_limit: int = 200,
+    ):
         self.quiz_samples = quiz_samples
         self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
         self.graph_storage: BaseGraphStorage = init_storage(
@@ -20,7 +25,6 @@ def __init__(self, working_dir: str = "cache", quiz_samples: int = 1, concurrenc
             backend="json_kv", working_dir=working_dir, namespace="quiz"
         )
         self.generator = QuizGenerator(self.llm_client)
-
         self.concurrency_limit = concurrency_limit
 
     def __call__(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]:
@@ -80,7 +84,6 @@ def quiz(self) -> Iterable[pd.DataFrame]:
             description = node_data["description"]
             items.append(description)
 
-        print("Total descriptions to quiz: %d", len(items))
         logger.info("Total descriptions to quiz: %d", len(items))
 
         for i in range(0, len(items), self.concurrency_limit):
diff --git a/requirements.txt b/requirements.txt
@@ -21,6 +21,8 @@ fastapi
 trafilatura
 aiohttp
 socksio
+pydantic
+ray==2.52.1
 
 leidenalg
 igraph