
Commit 4e8fbce

Merge branch 'main' of https://github.com/open-sciencelab/GraphGen into copilot/change-code-style-dataclass

2 parents: a5ed2f4 + 862e1d4

File tree

68 files changed: +1474 additions, −219 deletions

README.md

Lines changed: 10 additions & 1 deletion

@@ -21,7 +21,7 @@
 GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation
 
-[English](README.md) | [中文](README_ZH.md)
+[English](README.md) | [中文](README_zh)
 
 <details close>
 <summary><b>📚 Table of Contents</b></summary>
@@ -62,11 +62,20 @@ After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LL
 ## 📌 Latest Updates
 
+- **2025.10.23**: We support VQA(Visual Question Answering) data generation now. Run script: `bash scripts/generate/generate_vqa.sh`.
+- **2025.10.21**: We support PDF as input format for data generation now via [MinerU](https://github.com/opendatalab/MinerU).
 - **2025.09.29**: We auto-update gradio demo on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen).
+
+<details>
+<summary>History</summary>
+
 - **2025.08.14**: We have added support for community detection in knowledge graphs using the Leiden algorithm, enabling the synthesis of Chain-of-Thought (CoT) data.
 - **2025.07.31**: We have added Google, Bing, Wikipedia, and UniProt as search back-ends.
 - **2025.04.21**: We have released the initial version of GraphGen.
 
+</details>
+
 ## 🚀 Quick Start
 
 Experience GraphGen through [Web](https://g-app-center-120612-6433-jpdvmvp.openxlab.space) or [Backup Web Entrance](https://openxlab.org.cn/apps/detail/chenzihonga/GraphGen)

README_ZH.md renamed to README_zh.md

Lines changed: 10 additions & 1 deletion

(Chinese content translated below; the file mirrors README.md.)

@@ -20,7 +20,7 @@
 GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation
 
-[English](README.md) | [中文](README_ZH.md)
+[English](README.md) | [中文](README_zh)
 
 <details close>
 <summary><b>📚 Table of Contents</b></summary>
@@ -63,11 +63,20 @@ GraphGen first builds a fine-grained knowledge graph from the source text, then…
 ## 📌 Latest Updates
 
+- **2025.10.23**: We now support Visual Question Answering (VQA) data generation. Run script: `bash scripts/generate/generate_vqa.sh`
+- **2025.10.21**: We now support PDF as an input format for data generation via [MinerU](https://github.com/opendatalab/MinerU).
 - **2025.09.29**: We auto-update the Gradio app on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen).
+
+<details>
+<summary>History</summary>
+
 - **2025.08.14**: Added support for partitioning knowledge graphs with the Leiden community-detection algorithm to synthesize CoT data.
 - **2025.07.31**: Added Google, Bing, Wikipedia, and UniProt as search back-ends to help fill data gaps.
 - **2025.04.21**: Released the initial version of GraphGen.
 
+</details>
+
 ## 🚀 Quick Start
 
 Experience GraphGen via the [Web](https://g-app-center-120612-6433-jpdvmvp.openxlab.space) or the [Backup Web Entrance](https://openxlab.org.cn/apps/detail/chenzihonga/GraphGen).

graphgen/bases/base_reader.py

Lines changed: 45 additions & 0 deletions

@@ -1,6 +1,9 @@
+import os
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List
 
+import requests
+
 
 class BaseReader(ABC):
     """
@@ -18,3 +21,45 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
         :param file_path: Path to the input file.
         :return: List of dictionaries containing the data.
         """
+
+    @staticmethod
+    def filter(data: List[dict]) -> List[dict]:
+        """
+        Filter out entries with empty or missing text in the specified column.
+
+        :param data: List of dictionaries containing the data.
+        :return: Filtered list of dictionaries.
+        """
+
+        def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+            """
+            Check if an image exists at the given local path or URL.
+            :param path_or_url: Local file path or remote URL of the image.
+            :param timeout: Timeout for remote URL requests in seconds.
+            :return: True if the image exists, False otherwise.
+            """
+            if not path_or_url:
+                return False
+            if not path_or_url.startswith(("http://", "https://", "ftp://")):
+                path = path_or_url.replace("file://", "", 1)
+                path = os.path.abspath(path)
+                return os.path.isfile(path)
+            try:
+                resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+                return resp.status_code == 200
+            except requests.RequestException:
+                return False
+
+        filtered_data = []
+        for item in data:
+            if item.get("type") == "text":
+                content = item.get("content", "").strip()
+                if content:
+                    filtered_data.append(item)
+            elif item.get("type") in ("image", "table", "equation"):
+                img_path = item.get("img_path")
+                if _image_exists(img_path):
+                    filtered_data.append(item)
+            else:
+                filtered_data.append(item)
+        return filtered_data
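The new `filter` helper keeps text entries with non-empty content, keeps media entries only when their `img_path` resolves, and passes every other entry through untouched. A minimal standalone sketch of that logic (local paths only, omitting the `requests.head` probe for remote URLs):

```python
import os
from typing import Dict, List


def filter_entries(data: List[Dict]) -> List[Dict]:
    """Simplified version of BaseReader.filter: keep non-empty text,
    media entries whose local image file exists, and everything else."""
    kept = []
    for item in data:
        if item.get("type") == "text":
            if item.get("content", "").strip():
                kept.append(item)
        elif item.get("type") in ("image", "table", "equation"):
            path = item.get("img_path") or ""
            if os.path.isfile(path):
                kept.append(item)
        else:
            kept.append(item)  # unknown types are not dropped
    return kept


entries = [
    {"type": "text", "content": "GraphGen builds a knowledge graph."},
    {"type": "text", "content": "   "},            # dropped: blank after strip
    {"type": "image", "img_path": "/no/such.png"},  # dropped: file missing
    {"type": "metadata", "source": "demo.pdf"},     # kept: unknown type
]
print([e.get("type") for e in filter_entries(entries)])  # → ['text', 'metadata']
```

`filter_entries` and the sample `entries` are illustrative names, not part of the repository.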

graphgen/bases/datatypes.py

Lines changed: 10 additions & 0 deletions

@@ -7,8 +7,18 @@
 class Chunk:
     id: str
     content: str
+    type: str
     metadata: dict = field(default_factory=dict)
 
+    @staticmethod
+    def from_dict(key: str, data: dict) -> "Chunk":
+        return Chunk(
+            id=key,
+            content=data.get("content", ""),
+            type=data.get("type", "unknown"),
+            metadata={k: v for k, v in data.items() if k != "content"},
+        )
+
 
 @dataclass
 class QAPair:
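With the new `type` field and `from_dict` constructor, a storage record keyed by chunk id becomes a `Chunk` whose metadata preserves every field except the raw `content`. A self-contained sketch of the dataclass as changed in this commit (the sample record is hypothetical):

```python
from dataclasses import dataclass, field


@dataclass
class Chunk:
    id: str
    content: str
    type: str
    metadata: dict = field(default_factory=dict)

    @staticmethod
    def from_dict(key: str, data: dict) -> "Chunk":
        # Only "content" is excluded from metadata; "type" is kept in both places.
        return Chunk(
            id=key,
            content=data.get("content", ""),
            type=data.get("type", "unknown"),
            metadata={k: v for k, v in data.items() if k != "content"},
        )


chunk = Chunk.from_dict("chunk-0", {"content": "some text", "type": "text", "page": 3})
print(chunk.type, chunk.metadata)  # → text {'type': 'text', 'page': 3}
```

Note that a record without a `type` key falls back to `"unknown"`, which matches the pass-through branch of the reader's `filter`.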

graphgen/configs/aggregated_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
   max_tokens_per_community: 10240 # max tokens per community
   unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot
+  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/atomic_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
+  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
   method_params:
     max_units_per_community: 1 # atomic partition, one node or edge per community
 generate:
-  mode: atomic # atomic, aggregated, multi_hop, cot
+  mode: atomic # atomic, aggregated, multi_hop, cot, vqa
   data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
   use_lcc: false # whether to use the largest connected component
   random_seed: 42 # random seed for partitioning
 generate:
-  mode: cot # atomic, aggregated, multi_hop, cot
+  mode: cot # atomic, aggregated, multi_hop, cot, vqa
   data_format: Sharegpt # Alpaca, Sharegpt, ChatML

graphgen/configs/multi_hop_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
   max_tokens_per_community: 10240 # max tokens per community
   unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: multi_hop # strategy for generating multi-hop QA pairs
+  mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 18 additions & 0 deletions (new file)

@@ -0,0 +1,18 @@
+read:
+  input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+split:
+  chunk_size: 1024 # chunk size for text splitting
+  chunk_overlap: 100 # chunk overlap for text splitting
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
+  enabled: false
+partition: # graph partition configuration
+  method: anchor_bfs # partition method
+  method_params:
+    anchor_type: image # node type to select anchor nodes
+    max_units_per_community: 10 # atomic partition, one node or edge per community
+generate:
+  mode: vqa # atomic, aggregated, multi_hop, cot, vqa
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
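After `yaml.safe_load`, the new VQA config is a plain nested dict, and the comments in every config file enumerate the legal values for `mode` and `data_format`. A hypothetical validation sketch (the `ALLOWED_*` sets and the check itself are not part of the repository; the dict literal mirrors vqa_config.yaml above):

```python
# vqa_config.yaml as it would appear after yaml.safe_load
config = {
    "read": {"input_file": "resources/input_examples/vqa_demo.json"},
    "split": {"chunk_size": 1024, "chunk_overlap": 100},
    "search": {"enabled": False, "search_types": ["google"]},
    "quiz_and_judge": {"enabled": False},
    "partition": {
        "method": "anchor_bfs",
        "method_params": {"anchor_type": "image", "max_units_per_community": 10},
    },
    "generate": {"mode": "vqa", "data_format": "ChatML"},
}

# Legal values, taken from the inline comments in the config files.
ALLOWED_MODES = {"atomic", "aggregated", "multi_hop", "cot", "vqa"}
ALLOWED_FORMATS = {"Alpaca", "Sharegpt", "ChatML"}

mode = config["generate"]["mode"]
if mode not in ALLOWED_MODES:
    raise ValueError(f"Unsupported mode: {mode}")
if config["generate"]["data_format"] not in ALLOWED_FORMATS:
    raise ValueError("Unsupported data format")
print(mode)  # → vqa
```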

graphgen/generate.py

Lines changed: 5 additions & 18 deletions

@@ -72,24 +72,11 @@ def main():
 
     graph_gen.search(search_config=config["search"])
 
-    # Use pipeline according to the output data type
-    if mode in ["atomic", "aggregated", "multi_hop"]:
-        logger.info("Generation mode set to '%s'. Start generation.", mode)
-        if "quiz_and_judge" in config and config["quiz_and_judge"]["enabled"]:
-            graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
-        else:
-            logger.warning(
-                "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
-            )
-            assert (
-                config["partition"]["method"] == "ece"
-                and "method_params" in config["partition"]
-            ), "Only ECE partition with edge sampling is supported."
-            config["partition"]["method_params"]["edge_sampling"] = "random"
-    elif mode == "cot":
-        logger.info("Generation mode set to 'cot'. Start generation.")
-    else:
-        raise ValueError(f"Unsupported output data type: {mode}")
+    if config.get("quiz_and_judge", {}).get("enabled"):
+        graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
+
+    # TODO: add data filtering step here in the future
+    # graph_gen.filter(filter_config=config["filter"])
 
     graph_gen.generate(
         partition_config=config["partition"],
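The rewritten branch uses chained `dict.get` calls, so a config with no `quiz_and_judge` section behaves the same as one where it is explicitly disabled, whereas the old `config["quiz_and_judge"]["enabled"]` lookup would raise `KeyError`. A small sketch of the difference (`quiz_enabled` is an illustrative helper, not a function in the repo):

```python
def quiz_enabled(config: dict) -> bool:
    # .get with a {} default never raises, unlike config["quiz_and_judge"]["enabled"]
    return bool(config.get("quiz_and_judge", {}).get("enabled"))


print(quiz_enabled({"quiz_and_judge": {"enabled": True}}))   # → True
print(quiz_enabled({"quiz_and_judge": {"enabled": False}}))  # → False
print(quiz_enabled({}))  # → False (section missing entirely, no KeyError)
```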
