Merge branch 'main' of https://github.com/open-sciencelab/GraphGen into kg_builder

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 128d2f826867 · 2025-09-30T11:14:55.000+08:00
diff --git a/.github/sync-config.yml b/.github/sync-config.yml
@@ -13,7 +13,5 @@ sync:
     dest: app.py
   - source: requirements.txt
     dest: requirements.txt
-  - source: README_HF.md
-    dest: README.md
   - source: LICENSE
     dest: LICENSE
diff --git a/.github/workflows/push-to-hf.yml b/.github/workflows/push-to-hf.yml
@@ -43,7 +43,7 @@ jobs:
         [[ -d hf-repo ]] && rm -rf hf-repo
         git clone https://huggingface.co/${HF_REPO_TYPE}/${HF_REPO_ID} hf-repo
 
-        rsync -a --delete --exclude='.git' --exclude='hf-repo' ./ hf-repo/
+        rsync -a --delete --exclude='.git' --exclude='hf-repo' --exclude='README.md' ./ hf-repo/
 
         cd hf-repo
         git add .
diff --git a/.github/workflows/push-to-ms.yml b/.github/workflows/push-to-ms.yml
@@ -0,0 +1,50 @@
+name: Push demo branch to ModelScope
+
+on:
+  workflow_call:
+    inputs:
+      ref:
+        required: false
+        default: demo
+        type: string
+    secrets:
+      MS_TOKEN:
+        required: true
+
+jobs:
+  push-ms:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Configure Git identity
+        run: |
+          git config --global user.email "actions@github.com"
+          git config --global user.name  "github-actions[bot]"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # Official ModelScope SDK. Optional: the push step below uses only git and rsync, so install it only if you need to call the platform API.
+          pip install modelscope
+
+      - name: Push to ModelScope
+        env:
+          MS_TOKEN: ${{ secrets.MS_TOKEN }}
+          MS_REPO_TYPE: studios
+          MS_REPO_ID: chenzihong/GraphGen
+        run: |
+          [[ -d ms-repo ]] && rm -rf ms-repo
+          git clone https://oauth2:${MS_TOKEN}@www.modelscope.cn/${MS_REPO_TYPE}/${MS_REPO_ID}.git ms-repo
+
+          rsync -a --delete --exclude='.git' --exclude='ms-repo' --exclude='README.md' ./ ms-repo/
+
+          cd ms-repo
+          git add .
+          git diff-index --quiet HEAD || \
+            (git commit -m "Auto-sync from ${{ inputs.ref }} at $(date -u)" && \
+              git push "https://oauth2:${MS_TOKEN}@www.modelscope.cn/${MS_REPO_TYPE}/${MS_REPO_ID}.git")
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
 
     steps:
     - uses: actions/checkout@v4
diff --git a/.github/workflows/sync-demo.yml b/.github/workflows/sync-demo.yml
@@ -87,3 +87,10 @@ jobs:
     uses: ./.github/workflows/push-to-hf.yml
     secrets:
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
+  push-ms:
+    needs: sync-demo
+    uses: ./.github/workflows/push-to-ms.yml
+    secrets:
+      MS_TOKEN: ${{ secrets.MS_TOKEN }}
+    with:
+      ref: demo
diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@
 [![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-white?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416)
 
 [![Hugging Face](https://img.shields.io/badge/Demo-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/spaces/chenzihong/GraphGen)
+[![ModelScope](https://img.shields.io/badge/%F0%9F%A4%96%20Demo-on%20MS-green)](https://modelscope.cn/studios/chenzihong/GraphGen)
 [![OpenXLab](https://img.shields.io/badge/Demo-on%20OpenXLab-blue?logo=openxlab&logoColor=yellow)](https://g-app-center-120612-6433-jpdvmvp.openxlab.space)
 
 
@@ -60,6 +61,7 @@ After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LL
 
 ## 📌 Latest Updates
 
+- **2025.09.29**: We now automatically update the Gradio demo on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen).
 - **2025.08.14**: We have added support for community detection in knowledge graphs using the Leiden algorithm, enabling the synthesis of Chain-of-Thought (CoT) data.
 - **2025.07.31**: We have added Google, Bing, Wikipedia, and UniProt as search back-ends.
 - **2025.04.21**: We have released the initial version of GraphGen.
diff --git a/README_HF.md b/README_HF.md
diff --git a/README_ZH.md b/README_ZH.md
@@ -14,6 +14,7 @@
 [![Hugging Face](https://img.shields.io/badge/Paper-on%20HF-white?logo=huggingface&logoColor=yellow)](https://huggingface.co/papers/2505.20416)
 
 [![Hugging Face](https://img.shields.io/badge/Demo-on%20HF-blue?logo=huggingface&logoColor=yellow)](https://huggingface.co/spaces/chenzihong/GraphGen)
+[![ModelScope](https://img.shields.io/badge/%F0%9F%A4%96%20Demo-on%20MS-green)](https://modelscope.cn/studios/chenzihong/GraphGen)
 [![OpenXLab](https://img.shields.io/badge/Demo-on%20OpenXLab-blue?logo=openxlab&logoColor=yellow)](https://g-app-center-120612-6433-jpdvmvp.openxlab.space)
 
 GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation
@@ -61,6 +62,7 @@ GraphGen 首先根据源文本构建细粒度的知识图谱，然后利用期
 
 ## 📌 最新更新
 
+- **2025.09.29**：我们在 [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) 和 [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen) 上自动更新 Gradio 应用。
 - **2025.08.14**：支持利用 Leiden 社区发现算法对知识图谱进行社区划分，合成 CoT 数据。
 - **2025.07.31**：新增 Google、Bing、Wikipedia 和 UniProt 作为搜索后端，帮助填补数据缺口。  
 - **2025.04.21**：发布 GraphGen 初始版本。
diff --git a/graphgen/generate.py b/graphgen/generate.py
@@ -16,8 +16,6 @@
 
 def set_working_dir(folder):
     os.makedirs(folder, exist_ok=True)
-    os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
-    os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
 
 
 def save_config(config_path, global_config):
@@ -48,24 +46,27 @@ def main():
     args = parser.parse_args()
 
     working_dir = args.output_dir
-    set_working_dir(working_dir)
 
     with open(args.config_file, "r", encoding="utf-8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
 
     output_data_type = config["output_data_type"]
     unique_id = int(time.time())
+
+    output_path = os.path.join(
+        working_dir, "data", "graphgen", f"{unique_id}_{output_data_type}"
+    )
+    set_working_dir(output_path)
+
     set_logger(
-        os.path.join(
-            working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
-        ),
+        os.path.join(output_path, f"{unique_id}.log"),
         if_stream=True,
     )
     logger.info(
         "GraphGen with unique ID %s logging to %s",
         unique_id,
         os.path.join(
-            working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
+            working_dir, "logs", f"{unique_id}_graphgen_{output_data_type}.log"
         ),
     )
 
@@ -94,8 +95,7 @@ def main():
     else:
         raise ValueError(f"Unsupported output data type: {output_data_type}")
 
-    output_path = os.path.join(working_dir, "data", "graphgen", str(unique_id))
-    save_config(os.path.join(output_path, f"config-{unique_id}.yaml"), config)
+    save_config(os.path.join(output_path, "config.yaml"), config)
     logger.info("GraphGen completed successfully. Data saved to %s", output_path)
 
 
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -99,8 +99,13 @@ def __post_init__(self):
             self.working_dir, namespace="rephrase"
         )
         self.qa_storage: JsonListStorage = JsonListStorage(
-            os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)),
-            namespace=f"qa-{self.unique_id}",
+            os.path.join(
+                self.working_dir,
+                "data",
+                "graphgen",
+                f"{self.unique_id}_{self.config['output_data_type']}",
+            ),
+            namespace="qa",
         )
 
     @async_to_sync_method
diff --git a/graphgen/utils/log.py b/graphgen/utils/log.py
@@ -1,32 +1,55 @@
 import logging
+from logging.handlers import RotatingFileHandler
+
+from rich.logging import RichHandler
 
 logger = logging.getLogger("graphgen")
 
-def set_logger(log_file: str, log_level: int = logging.INFO, if_stream: bool = True):
-    logger.setLevel(log_level)
 
-    formatter = logging.Formatter(
-        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    )
+def set_logger(
+    log_file: str,
+    log_level: int = logging.INFO,
+    *,
+    if_stream: bool = True,
+    max_bytes: int = 50 * 1024 * 1024,  # 50 MB
+    backup_count: int = 5,
+    force: bool = False,
+):
 
-    file_handler = logging.FileHandler(log_file, mode='w')
-    file_handler.setLevel(log_level)
-    file_handler.setFormatter(formatter)
+    if logger.hasHandlers() and not force:
+        return
 
-    stream_handler = None
+    if force:
+        logger.handlers.clear()
 
-    if if_stream:
-        stream_handler = logging.StreamHandler()
-        stream_handler.setLevel(log_level)
-        stream_handler.setFormatter(formatter)
+    logger.setLevel(log_level)
+    logger.propagate = False
 
-    if not logger.handlers:
-        logger.addHandler(file_handler)
-        if if_stream and stream_handler:
-            logger.addHandler(stream_handler)
+    if logger.handlers:
+        logger.handlers.clear()
+
+    if if_stream:
+        console = RichHandler(level=log_level, show_path=False, rich_tracebacks=True)
+        console.setFormatter(logging.Formatter("%(message)s"))
+        logger.addHandler(console)
+
+    file_handler = RotatingFileHandler(
+        log_file,
+        maxBytes=max_bytes,
+        backupCount=backup_count,
+        encoding="utf-8",
+    )
+    file_handler.setLevel(log_level)
+    file_handler.setFormatter(
+        logging.Formatter(
+            "[%(asctime)s] %(levelname)s [%(name)s:%(filename)s:%(lineno)d] %(message)s",
+            datefmt="%y-%m-%d %H:%M:%S",
+        )
+    )
+    logger.addHandler(file_handler)
 
 
 def parse_log(log_file: str):
-    with open(log_file, "r", encoding='utf-8') as f:
+    with open(log_file, "r", encoding="utf-8") as f:
         lines = f.readlines()
     return lines
diff --git a/tests/e2e_tests/__init__.py b/tests/e2e_tests/__init__.py
diff --git a/tests/e2e_tests/test_generate_aggregated.py b/tests/e2e_tests/test_generate_aggregated.py
@@ -0,0 +1,50 @@
+import json
+import os
+import subprocess
+from pathlib import Path
+
+
+def test_generate_aggregated(tmp_path: Path):
+    repo_root = Path(__file__).resolve().parents[2]
+    os.chdir(repo_root)
+
+    config_path = repo_root / "graphgen" / "configs" / "aggregated_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    result = subprocess.run(
+        [
+            "python",
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists(), f"{data_root} does not exist"
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"
diff --git a/tests/e2e_tests/test_generate_atomic.py b/tests/e2e_tests/test_generate_atomic.py
diff --git a/tests/e2e_tests/test_generate_cot.py b/tests/e2e_tests/test_generate_cot.py
diff --git a/tests/e2e_tests/test_generate_multi_hop.py b/tests/e2e_tests/test_generate_multi_hop.py