feat: add vqa_generator

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 22aae9acbbe0 · 2025-10-23T18:29:47.000+08:00
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -286,7 +286,10 @@ async def quiz_and_judge(self, quiz_and_judge_config: Dict):
     async def generate(self, partition_config: Dict, generate_config: Dict):
         # Step 1: partition the graph
         batches = await partition_kg(
-            self.graph_storage, self.tokenizer_instance, partition_config
+            self.graph_storage,
+            self.chunks_storage,
+            self.tokenizer_instance,
+            partition_config,
         )
 
         # Step 2： generate QA pairs
diff --git a/graphgen/models/generator/vqa_generator.py b/graphgen/models/generator/vqa_generator.py
@@ -60,3 +60,79 @@ def parse_response(response: str) -> Any:
                 "answer": answer,
             }
         return qa_pairs
+
+    async def generate(
+        self,
+        batch: tuple[
+            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
+        ],
+    ) -> dict[str, Any]:
+        """
+        Ask the LLM for QA pairs about one batch and attach image data to them.
+
+        :param batch: ``(nodes, edges)``; each node is read as ``(id, data)``.
+        :return: the parsed QA pairs; if a node has a non-empty "images" entry,
+            every pair also carries that entry under "img_path".
+        """
+        prompt_text = self.build_prompt(batch)
+        llm_output = await self.llm_client.generate_answer(prompt_text)
+        parsed = self.parse_response(llm_output)  # one or more QA pairs
+        nodes, _ = batch
+        # Later image nodes overwrite earlier ones, so only the last one counts.
+        found = [node[1]["images"] for node in nodes if node[1].get("images")]
+        if found:
+            for pair in parsed.values():
+                pair["img_path"] = found[-1]
+        # Return a fresh top-level dict, as before (the pair dicts are shared).
+        return dict(parsed)
+
+    @staticmethod
+    def format_generation_results(
+        results: list[dict], output_data_format: str
+    ) -> list[dict[str, Any]]:
+        """
+        Flatten QA dicts into records of the requested dataset format.
+
+        Alpaca records are flat instruction/input/output/image fields; Sharegpt
+        and ChatML records wrap the pair in a two-turn conversation.
+
+        :param results: dicts whose values hold "question" and "answer" and
+            optionally "img_path" (an empty string is used when absent).
+        :param output_data_format: "Alpaca", "Sharegpt" or "ChatML".
+        :return: one record per QA pair, in input order.
+        :raises ValueError: for any other format name.
+        """
+
+        def user_payload(qa: dict) -> list[dict]:
+            # Sharegpt and ChatML use the same text-plus-image user content.
+            return [{"text": qa["question"], "image": qa.get("img_path", "")}]
+
+        def to_alpaca(qa: dict) -> dict:
+            return {
+                "instruction": qa["question"],
+                "input": "",
+                "output": qa["answer"],
+                "image": qa.get("img_path", ""),
+            }
+
+        def to_sharegpt(qa: dict) -> dict:
+            human = {"from": "human", "value": user_payload(qa)}
+            gpt = {"from": "gpt", "value": qa["answer"]}
+            return {"conversations": [human, gpt]}
+
+        def to_chatml(qa: dict) -> dict:
+            user = {"role": "user", "content": user_payload(qa)}
+            assistant = {"role": "assistant", "content": qa["answer"]}
+            return {"messages": [user, assistant]}
+
+        # Resolve the format before touching `results`, so an unknown name
+        # fails even when there is nothing to convert.
+        if output_data_format == "Alpaca":
+            convert = to_alpaca
+        elif output_data_format == "Sharegpt":
+            convert = to_sharegpt
+        elif output_data_format == "ChatML":
+            convert = to_chatml
+        else:
+            raise ValueError(f"Unknown output data format: {output_data_format}")
+        return [convert(qa) for item in results for qa in item.values()]
diff --git a/graphgen/operators/partition/partition_kg.py b/graphgen/operators/partition/partition_kg.py
@@ -1,6 +1,6 @@
 from typing import Any
 
-from graphgen.bases import BaseGraphStorage, BaseTokenizer
+from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseTokenizer
 from graphgen.models import (
     AnchorBFSPartitioner,
     BFSPartitioner,
@@ -15,6 +15,7 @@
 
 async def partition_kg(
     kg_instance: BaseGraphStorage,
+    chunk_storage: BaseKVStorage,
     tokenizer: Any = BaseTokenizer,
     partition_config: dict = None,
 ) -> list[
@@ -54,4 +55,14 @@ async def partition_kg(
     communities = await partitioner.partition(g=kg_instance, **method_params)
     logger.info("Partitioned the graph into %d communities.", len(communities))
     batches = await partitioner.community2batch(communities, g=kg_instance)
+
+    for _, batch in enumerate(batches):
+        nodes, edges = batch
+        for node_id, node_data in nodes:
+            entity_type = node_data.get("entity_type")
+            if "image" in entity_type.lower():
+                node_id = node_id.strip('"').lower()
+                image_data = await chunk_storage.get_by_id(node_id)
+                if image_data:
+                    node_data["images"] = image_data
     return batches