Skip to content

Commit bc07222

Browse files
fix: fix quiz params
1 parent 99a6e5f commit bc07222

File tree

9 files changed

+190
-145
lines changed

9 files changed

+190
-145
lines changed

graphgen/operators/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
from .quiz import QuizService
77
from .read import read
88
from .search import search_all
9+
from .judge import JudgeService
910

1011
operators = {
1112
"read": read,
1213
"chunk": ChunkService,
1314
"build_kg": BuildKGService,
1415
"quiz": QuizService,
16+
"judge": JudgeService,
1517
"extract_info": extract_info,
1618
"search_all": search_all,
1719
"partition_kg": partition_kg,

graphgen/operators/chunk/chunk_service.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import pandas as pd
66

7+
from graphgen.common import init_storage
78
from graphgen.models import (
89
ChineseRecursiveTextSplitter,
910
RecursiveCharacterSplitter,
@@ -40,9 +41,14 @@ def split_chunks(text: str, language: str = "en", **kwargs) -> list:
4041

4142

4243
class ChunkService:
43-
def __init__(self, **chunk_kwargs):
44+
def __init__(self, working_dir: str = "cache", **chunk_kwargs):
4445
tokenizer_model = os.getenv("TOKENIZER_MODEL", "cl100k_base")
4546
self.tokenizer_instance: Tokenizer = Tokenizer(model_name=tokenizer_model)
47+
self.chunk_storage = init_storage(
48+
backend="json_kv",
49+
working_dir=working_dir,
50+
namespace="chunk",
51+
)
4652
self.chunk_kwargs = chunk_kwargs
4753

4854
def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
@@ -88,4 +94,8 @@ def chunk_documents(self, new_docs: list) -> list:
8894
**doc,
8995
}
9096
)
97+
self.chunk_storage.upsert(
98+
{chunk["_chunk_id"]: chunk for chunk in chunks}
99+
)
100+
self.chunk_storage.index_done_callback()
91101
return chunks
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from .extract_info import extract_info
1+
from .extract import extract_info
File renamed without changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .judge_service import JudgeService

graphgen/operators/judge/judge.py

Lines changed: 0 additions & 139 deletions
This file was deleted.
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
import math
2+
3+
import gradio as gr
4+
5+
from graphgen.bases import BaseLLMWrapper
6+
from graphgen.models import JsonKVStorage, NetworkXStorage
7+
from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
8+
from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy
9+
10+
11+
import math
12+
from collections.abc import Iterable
13+
14+
import pandas as pd
15+
16+
from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper
17+
from graphgen.common import init_llm, init_storage
18+
from graphgen.models import NetworkXStorage, JsonKVStorage
19+
from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
20+
from graphgen.utils import logger, run_concurrent, yes_no_loss_entropy
21+
22+
23+
class JudgeService:
    """Service for judging graph edges and nodes using a trainee LLM.

    Currently a stub: it wires up the trainee LLM client, while the
    actual statement-scoring logic (top-k token judgement +
    ``yes_no_loss_entropy``) remains commented out below pending
    migration to the new service API.
    """

    def __init__(self, working_dir: str = "cache"):
        """Initialize the service.

        :param working_dir: cache directory, kept for interface parity
            with the other operator services (ChunkService, QuizService).
            NOTE(review): not yet consumed — presumably the graph and
            rephrase storages will be initialized from it here; confirm
            when the judging logic is restored.
        """
        self.working_dir = working_dir
        self.llm_client: BaseLLMWrapper = init_llm("trainee")

    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
        """Pipeline entry point; the incoming batch is not consumed."""
        return pd.DataFrame([{"status": "judging_completed"}])

    def judge(self) -> Iterable[pd.DataFrame]:
        """Judge the statements in the graph storage.

        :return: an iterable of result frames (currently empty — the
            implementation is stubbed out).
        """
        # Return an empty iterator rather than None so callers that
        # iterate the annotated Iterable do not crash on the stub.
        return iter(())
39+
40+
41+
42+
# async def judge_statement( # pylint: disable=too-many-statements
43+
# trainee_llm_client: BaseLLMWrapper,
44+
# graph_storage: NetworkXStorage,
45+
# rephrase_storage: JsonKVStorage,
46+
# re_judge: bool = False,
47+
# progress_bar: gr.Progress = None,
48+
# ) -> NetworkXStorage:
49+
# """
50+
# Get all edges and nodes and judge them
51+
#
52+
# :param trainee_llm_client: judge the statements to get comprehension loss
53+
# :param graph_storage: graph storage instance
54+
# :param rephrase_storage: rephrase storage instance
55+
# :param re_judge: re-judge the relations
56+
# :param progress_bar
57+
# :return:
58+
# """
59+
#
60+
# async def _judge_single_relation(
61+
# edge: tuple,
62+
# ):
63+
# source_id = edge[0]
64+
# target_id = edge[1]
65+
# edge_data = edge[2]
66+
#
67+
# if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
68+
# logger.debug(
69+
# "Edge %s -> %s already judged, loss: %s, skip",
70+
# source_id,
71+
# target_id,
72+
# edge_data["loss"],
73+
# )
74+
# return source_id, target_id, edge_data
75+
#
76+
# description = edge_data["description"]
77+
#
78+
# try:
79+
# descriptions = rephrase_storage.get_by_id(description)
80+
# assert descriptions is not None
81+
#
82+
# judgements = []
83+
# gts = [gt for _, gt in descriptions]
84+
# for description, gt in descriptions:
85+
# judgement = await trainee_llm_client.generate_topk_per_token(
86+
# STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(statement=description)
87+
# )
88+
# judgements.append(judgement[0].top_candidates)
89+
#
90+
# loss = yes_no_loss_entropy(judgements, gts)
91+
#
92+
# logger.debug(
93+
# "Edge %s -> %s description: %s loss: %s",
94+
# source_id,
95+
# target_id,
96+
# description,
97+
# loss,
98+
# )
99+
#
100+
# edge_data["loss"] = loss
101+
# except Exception as e: # pylint: disable=broad-except
102+
# logger.error(
103+
# "Error in judging relation %s -> %s: %s", source_id, target_id, e
104+
# )
105+
# logger.info("Use default loss 0.1")
106+
# edge_data["loss"] = -math.log(0.1)
107+
#
108+
# graph_storage.update_edge(source_id, target_id, edge_data)
109+
# return source_id, target_id, edge_data
110+
#
111+
# edges = graph_storage.get_all_edges()
112+
#
113+
# await run_concurrent(
114+
# _judge_single_relation,
115+
# edges,
116+
# desc="Judging relations",
117+
# unit="relation",
118+
# progress_bar=progress_bar,
119+
# )
120+
#
121+
# async def _judge_single_entity(
122+
# node: tuple,
123+
# ):
124+
# node_id = node[0]
125+
# node_data = node[1]
126+
#
127+
# if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
128+
# logger.debug(
129+
# "Node %s already judged, loss: %s, skip", node_id, node_data["loss"]
130+
# )
131+
# return node_id, node_data
132+
#
133+
# description = node_data["description"]
134+
#
135+
# try:
136+
# descriptions = rephrase_storage.get_by_id(description)
137+
# assert descriptions is not None
138+
#
139+
# judgements = []
140+
# gts = [gt for _, gt in descriptions]
141+
# for description, gt in descriptions:
142+
# judgement = await trainee_llm_client.generate_topk_per_token(
143+
# STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(statement=description)
144+
# )
145+
# judgements.append(judgement[0].top_candidates)
146+
#
147+
# loss = yes_no_loss_entropy(judgements, gts)
148+
#
149+
# logger.debug("Node %s description: %s loss: %s", node_id, description, loss)
150+
#
151+
# node_data["loss"] = loss
152+
# except Exception as e: # pylint: disable=broad-except
153+
# logger.error("Error in judging entity %s: %s", node_id, e)
154+
# logger.error("Use default loss 0.1")
155+
# node_data["loss"] = -math.log(0.1)
156+
#
157+
# graph_storage.update_node(node_id, node_data)
158+
# return node_id, node_data
159+
#
160+
# nodes = graph_storage.get_all_nodes()
161+
#
162+
# await run_concurrent(
163+
# _judge_single_entity,
164+
# nodes,
165+
# desc="Judging entities",
166+
# unit="entity",
167+
# progress_bar=progress_bar,
168+
# )
169+
#
170+
# return graph_storage
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from .quiz import QuizService
1+
from .quiz_service import QuizService
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper
66
from graphgen.common import init_llm, init_storage
77
from graphgen.models import QuizGenerator
8-
from graphgen.utils import compute_content_hash, logger, run_concurrent
8+
from graphgen.utils import compute_content_hash, run_concurrent, logger
99

1010

1111
class QuizService:
12-
def __init__(self, working_dir: str = "cache", quiz_samples: int = 1):
12+
def __init__(self, working_dir: str = "cache", quiz_samples: int = 1, concurrency_limit: int = 200):
1313
self.quiz_samples = quiz_samples
1414
self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
1515
self.graph_storage: BaseGraphStorage = init_storage(
@@ -21,7 +21,7 @@ def __init__(self, working_dir: str = "cache", quiz_samples: int = 1):
2121
)
2222
self.generator = QuizGenerator(self.llm_client)
2323

24-
self.concurrency_limit = 20
24+
self.concurrency_limit = concurrency_limit
2525

2626
def __call__(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]:
2727
# this operator does not consume any batch data
@@ -80,6 +80,7 @@ def quiz(self) -> Iterable[pd.DataFrame]:
8080
description = node_data["description"]
8181
items.append(description)
8282

83+
print("Total descriptions to quiz: %d", len(items))
8384
logger.info("Total descriptions to quiz: %d", len(items))
8485

8586
for i in range(0, len(items), self.concurrency_limit):

0 commit comments

Comments
 (0)