Merge pull request #56 from open-sciencelab/e2e-tests

ChenZiHong-Gavin · web-flow · commit 7186cfe7727e · 2025-09-29T17:56:58.000+08:00
tests: add e2e tests
diff --git a/graphgen/generate.py b/graphgen/generate.py
@@ -16,8 +16,6 @@
 
 def set_working_dir(folder):
     os.makedirs(folder, exist_ok=True)
-    os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
-    os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
 
 
 def save_config(config_path, global_config):
@@ -48,17 +46,20 @@ def main():
     args = parser.parse_args()
 
     working_dir = args.output_dir
-    set_working_dir(working_dir)
 
     with open(args.config_file, "r", encoding="utf-8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
 
     output_data_type = config["output_data_type"]
     unique_id = int(time.time())
+
+    output_path = os.path.join(
+        working_dir, "data", "graphgen", f"{unique_id}_{output_data_type}"
+    )
+    set_working_dir(output_path)
+
     set_logger(
-        os.path.join(
-            working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
-        ),
+        os.path.join(output_path, f"{unique_id}.log"),
         if_stream=True,
     )
     logger.info(
@@ -94,8 +95,7 @@ def main():
     else:
         raise ValueError(f"Unsupported output data type: {output_data_type}")
 
-    output_path = os.path.join(working_dir, "data", "graphgen", str(unique_id))
-    save_config(os.path.join(output_path, f"config-{unique_id}.yaml"), config)
+    save_config(os.path.join(output_path, "config.yaml"), config)
     logger.info("GraphGen completed successfully. Data saved to %s", output_path)
 
 
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -102,8 +102,13 @@ def __post_init__(self):
             self.working_dir, namespace="rephrase"
         )
         self.qa_storage: JsonListStorage = JsonListStorage(
-            os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)),
-            namespace=f"qa-{self.unique_id}",
+            os.path.join(
+                self.working_dir,
+                "data",
+                "graphgen",
+                f"{self.unique_id}_{self.config['output_data_type']}",
+            ),
+            namespace="qa",
         )
 
     async def async_split_chunks(self, data: List[Union[List, Dict]]) -> dict:
diff --git a/tests/e2e_tests/__init__.py b/tests/e2e_tests/__init__.py
diff --git a/tests/e2e_tests/test_generate_aggregated.py b/tests/e2e_tests/test_generate_aggregated.py
@@ -0,0 +1,60 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_aggregated(tmp_path: Path):
+    """Run ``graphgen.generate`` with the aggregated config and check its outputs.
+
+    The CLI is launched in a subprocess that writes under ``tmp_path``; the test
+    then asserts that the run folder holds ``config.yaml``, a log file, and a
+    JSON file containing a non-empty list.
+    """
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "aggregated_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    result = subprocess.run(
+        [
+            # Use the interpreter running this test rather than whatever
+            # "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        # Run the child from the repo root without changing this process's cwd.
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists(), f"{data_root} does not exist"
+    # Sort descending by folder name and inspect the first entry.
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"
diff --git a/tests/e2e_tests/test_generate_atomic.py b/tests/e2e_tests/test_generate_atomic.py
@@ -0,0 +1,60 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_atomic(tmp_path: Path):
+    """Run ``graphgen.generate`` with the atomic config and check its outputs.
+
+    The CLI is launched in a subprocess that writes under ``tmp_path``; the test
+    then asserts that the run folder holds ``config.yaml``, a log file, and a
+    JSON file containing a non-empty list.
+    """
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "atomic_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    result = subprocess.run(
+        [
+            # Use the interpreter running this test rather than whatever
+            # "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        # Run the child from the repo root without changing this process's cwd.
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists(), f"{data_root} does not exist"
+    # Sort descending by folder name and inspect the first entry.
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"
diff --git a/tests/e2e_tests/test_generate_cot.py b/tests/e2e_tests/test_generate_cot.py
@@ -0,0 +1,60 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_cot(tmp_path: Path):
+    """Run ``graphgen.generate`` with the CoT config and check its outputs.
+
+    The CLI is launched in a subprocess that writes under ``tmp_path``; the test
+    then asserts that the run folder holds ``config.yaml``, a log file, and a
+    JSON file containing a non-empty list.
+    """
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "cot_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    result = subprocess.run(
+        [
+            # Use the interpreter running this test rather than whatever
+            # "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        # Run the child from the repo root without changing this process's cwd.
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    assert data_root.exists(), f"{data_root} does not exist"
+    # Sort descending by folder name and inspect the first entry.
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"
diff --git a/tests/e2e_tests/test_generate_multi_hop.py b/tests/e2e_tests/test_generate_multi_hop.py
@@ -0,0 +1,60 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+def test_generate_multi_hop(tmp_path: Path):
+    """Run ``graphgen.generate`` with the multi-hop config and check its outputs.
+
+    The CLI is launched in a subprocess that writes under ``tmp_path``; the test
+    then asserts that the run folder holds ``config.yaml``, a log file, and a
+    JSON file containing a non-empty list.
+    """
+    repo_root = Path(__file__).resolve().parents[2]
+
+    config_path = repo_root / "graphgen" / "configs" / "multi_hop_config.yaml"
+    output_dir = tmp_path / "output"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    result = subprocess.run(
+        [
+            # Use the interpreter running this test rather than whatever
+            # "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "graphgen.generate",
+            "--config_file",
+            str(config_path),
+            "--output_dir",
+            str(output_dir),
+        ],
+        # Run the child from the repo root without changing this process's cwd.
+        cwd=repo_root,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"Script failed with error: {result.stderr}"
+
+    data_root = output_dir / "data" / "graphgen"
+    # is_dir() is False for a missing path too, so it covers both conditions.
+    assert data_root.is_dir(), f"{data_root} does not exist or is not a directory"
+    run_folders = sorted(data_root.iterdir(), key=lambda p: p.name, reverse=True)
+    assert run_folders, f"No run folders found in {data_root}"
+    run_folder = run_folders[0]
+
+    config_saved = run_folder / "config.yaml"
+    assert config_saved.exists(), f"{config_saved} not found"
+
+    json_files = list(run_folder.glob("*.json"))
+    assert json_files, f"No JSON output found in {run_folder}"
+
+    log_files = list(run_folder.glob("*.log"))
+    assert log_files, "No log file generated"
+
+    with open(json_files[0], "r", encoding="utf-8") as f:
+        data = json.load(f)
+    assert (
+        isinstance(data, list) and len(data) > 0
+    ), "JSON output is empty or not a list"