fix: fix operators' registry

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 73f70a5f0df7 · 2025-12-11T17:16:01.000+08:00
diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py
@@ -1,7 +1,7 @@
 from .build_kg import BuildKGService
 from .chunk import ChunkService
 from .extract import extract
-from .generate import generate_qas
+from .generate import GenerateService
 from .judge import JudgeService
 from .partition import PartitionService
 from .quiz import QuizService
@@ -14,8 +14,8 @@
     "build_kg": BuildKGService,
     "quiz": QuizService,
     "judge": JudgeService,
-    "extract_info": extract,
-    "search_all": search_all,
+    "extract": extract,
+    "search": search_all,
     "partition": PartitionService,
-    "generate_qas": generate_qas,
+    "generate": GenerateService,
 }
diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py
@@ -1,3 +1,4 @@
+import json
 import os
 from typing import Iterable
 
@@ -149,7 +150,7 @@ def _attach_additional_data_to_node(self, batch: tuple) -> tuple:
                 if image_chunks:
                     # The generator expects a dictionary with an 'img_path' key, not a list of captions.
                     # We'll use the first image chunk found for this node.
-                    node_data["images"] = image_chunks[0]
+                    node_data["image_data"] = json.loads(image_chunks[0]["content"])
                     logger.debug("Attached image data to node %s", node_id)
 
         return nodes_data, edges_data
diff --git a/graphgen/operators/read/read.py b/graphgen/operators/read/read.py
@@ -85,7 +85,7 @@ def read(
         logger.info("[READ] Found %d files to process", len(all_files))
 
         if not all_files:
-            return ray.data.from_items([])
+            raise ValueError("No files found to read.")
 
         # 2. Group files by suffix to use appropriate reader
         files_by_suffix = {}
@@ -116,7 +116,7 @@ def read(
         combined_ds = combined_ds.map(
             lambda record: {
                 **record,
-                "_doc_id": compute_mm_hash(record),
+                "_doc_id": compute_mm_hash(record, prefix="doc-"),
             }
         )
 

Original file line number	Diff line number	Diff line change
`@@ -85,7 +85,7 @@ def read(`
`85`	`85`	`logger.info("[READ] Found %d files to process", len(all_files))`
`86`	`86`
`87`	`87`	`if not all_files:`
`88`		`- return ray.data.from_items([])`
	`88`	`+ raise ValueError("No files found to read.")`
`89`	`89`
`90`	`90`	`# 2. Group files by suffix to use appropriate reader`
`91`	`91`	`files_by_suffix = {}`
`@@ -116,7 +116,7 @@ def read(`
`116`	`116`	`combined_ds = combined_ds.map(`
`117`	`117`	`lambda record: {`
`118`	`118`	`**record,`
`119`		`- "_doc_id": compute_mm_hash(record),`
	`119`	`+ "_doc_id": compute_mm_hash(record, prefix="doc-"),`
`120`	`120`	`}`
`121`	`121`	`)`
`122`	`122`