
Commit 96a0820

Merge branch 'main' into dev/scalable_grpo
2 parents: 6400b5d + 54c2d72


20 files changed: +695, -65 lines


docs/sphinx_doc/source/tutorial/develop_workflow.md

Lines changed: 4 additions & 3 deletions
@@ -513,13 +513,14 @@ Here, `<config_file_path>` is the path to a YAML configuration file, which shoul
 Once started, the model will keep running and wait for debug instructions; it will not exit automatically. You can then run the following command in another terminal to debug your workflow:

 ```bash
-trinity debug --config <config_file_path> --module workflow --output-file <output_file_path> --plugin-dir <plugin_dir>
+trinity debug --config <config_file_path> --module workflow --output-dir <output_dir> --plugin-dir <plugin_dir> --enable-profiling
 ```

 - `<config_file_path>`: Path to the YAML configuration file, usually the same as used for starting the inference model.
-- `<output_file_path>`: Path to save the performance profiling results. Debug Mode uses [viztracer](https://github.com/gaogaotiantian/viztracer) to profile the workflow execution and saves the results as an HTML file for easy viewing in a browser.
+- `<output_dir>`: Directory to save the debug output. If not specified, the output is saved to the `debug_output` directory in the current working directory.
 - `<plugin_dir>` (optional): Path to the plugin directory. If your workflow or reward function modules are not built into Trinity-RFT, you can specify this parameter to load custom modules.
+- `--enable-profiling` (optional): Enable performance profiling using [viztracer](https://github.com/gaogaotiantian/viztracer).

-During debugging, the `buffer.explorer_input.taskset` field in the config will be loaded to initialize the workflow's required task dataset and instance. Note that Debug Mode only reads the first sample in the dataset for testing. After running the above command, the workflow's return value will be automatically formatted and printed in the terminal for easy inspection.
+During debugging, the `buffer.explorer_input.taskset` field in the config will be loaded to initialize the workflow's required task dataset and instance. Note that Debug Mode only reads the first sample in the dataset for testing. After running the above command, the workflow's return value will be automatically formatted and printed in the terminal for easy inspection, and the generated experiences will be saved to the `<output_dir>/experiences.db` file.

 When debugging is complete, you can terminate the inference model by pressing `Ctrl+C` in its terminal.
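
As a quick sanity check after a debug run, the saved experiences can be inspected directly. The sketch below assumes `experiences.db` is an ordinary SQLite file (the wording above suggests this, but the change does not spell out its schema) and only lists table names and row counts; it is illustrative and not part of Trinity-RFT:

```python
import sqlite3

# Hypothetical path; substitute your actual <output_dir>.
db_path = "debug_output/experiences.db"

with sqlite3.connect(db_path) as conn:
    # Enumerate tables without assuming anything about their schema.
    tables = [row[0] for row in conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table'"
    )]
    for table in tables:
        count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
        print(f"{table}: {count} rows")
```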

docs/sphinx_doc/source_zh/tutorial/develop_workflow.md

Lines changed: 6 additions & 5 deletions
@@ -509,13 +509,14 @@ trinity debug --config <config_file_path> --module inference_model
 Once started, the model will keep running and wait for debug instructions; it will not exit automatically. You can then run the following command in another terminal to debug your workflow:

 ```bash
-trinity debug --config <config_file_path> --module workflow --output-file <output_file_path> --plugin-dir <plugin_dir>
+trinity debug --config <config_file_path> --module workflow --output-dir <output_dir> --plugin-dir <plugin_dir> --enable-profiling
 ```

-- `config_file_path`: Path to the YAML configuration file, usually the same one used to start the inference model.
-- `output_file_path`: Output path for the performance profiling results. Debug Mode uses [viztracer](https://github.com/gaogaotiantian/viztracer) to profile the workflow execution and saves the results as an HTML file for viewing in a browser.
-- `plugin_dir` (optional): Path to the plugin directory. If your workflow or reward function modules are not built into Trinity-RFT, you can use this parameter to load custom modules.
+- `<config_file_path>`: Path to the YAML configuration file, usually the same one used to start the inference model.
+- `<output_dir>`: Directory where the debug output is saved. If not specified, the output is saved to the `debug_output` directory in the current working directory.
+- `<plugin_dir>` (optional): Path to the plugin directory. If your workflow or reward function modules are not built into Trinity-RFT, you can use this parameter to load custom modules.
+- `--enable-profiling` (optional): Enable performance profiling; [viztracer](https://github.com/gaogaotiantian/viztracer) is used to profile the workflow execution.

-During debugging, the `buffer.explorer_input.taskset` field in the config is loaded to initialize the task dataset and instance required by the workflow. Note that Debug Mode only reads the first sample in the dataset for testing. After running the above command, the workflow's return value is automatically formatted and printed in the terminal for easy inspection.
+During debugging, the `buffer.explorer_input.taskset` field in the config is loaded to initialize the task dataset and instance required by the workflow. Note that Debug Mode only reads the first sample in the dataset for testing. After running the above command, the workflow's return value is automatically formatted and printed in the terminal for easy inspection, and the generated experiences are saved to the `<output_dir>/experiences.db` database.

 When debugging is complete, press `Ctrl+C` in the inference model's terminal to terminate it.

tests/algorithm/kl_fn_test.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""Test for KL functions"""

import unittest

import torch

from trinity.algorithm.kl_fn.kl_fn import KL_FN


class KLFnTest(unittest.TestCase):
    def setUp(self):
        seed = 42
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        shape = (4, 10)
        self.logprob = 2 * torch.rand(shape) - 1
        self.ref_logprob = 2 * torch.rand(shape) - 1
        self.old_logprob = 2 * torch.rand(shape) - 1
        self.response_mask = torch.rand(shape) > 0.5

    def test_k1_kl_fn(self):
        kl_fn_cls = KL_FN.get("k1")
        kl_fn = kl_fn_cls(kl_coef=0.01)
        kl = kl_fn.calculate_kl(self.logprob, self.ref_logprob)
        expected_kl = self.logprob - self.ref_logprob
        self.assertTrue(torch.allclose(kl, expected_kl))

    def test_k2_kl_fn(self):
        kl_fn_cls = KL_FN.get("k2")
        kl_fn = kl_fn_cls(kl_coef=0.01)
        kl = kl_fn.calculate_kl(self.logprob, self.ref_logprob)
        expected_kl = (self.logprob - self.ref_logprob).square() * 0.5
        self.assertTrue(torch.allclose(kl, expected_kl))

    def test_k3_kl_fn(self):
        kl_fn_cls = KL_FN.get("k3")
        kl_fn = kl_fn_cls(kl_coef=0.01)
        kl = kl_fn.calculate_kl(self.logprob, self.ref_logprob)
        logr = self.ref_logprob - self.logprob
        expected_kl = logr.exp() - 1 - logr
        self.assertTrue(torch.allclose(kl, expected_kl))

    def test_abs_kl_fn(self):
        kl_fn_cls = KL_FN.get("abs")
        kl_fn = kl_fn_cls(kl_coef=0.01)
        kl = kl_fn.calculate_kl(self.logprob, self.ref_logprob)
        expected_kl = torch.abs(self.logprob - self.ref_logprob)
        self.assertTrue(torch.allclose(kl, expected_kl))

    def test_low_var_kl_fn(self):
        kl_fn_cls = KL_FN.get("low_var_kl")
        kl_fn = kl_fn_cls(kl_coef=0.01)
        kl = kl_fn.calculate_kl(self.logprob, self.ref_logprob)
        kl_intermediate = self.ref_logprob - self.logprob
        kl_intermediate = torch.clamp(kl_intermediate, min=-20, max=20)
        ratio = torch.exp(kl_intermediate)
        expected_kl = torch.clamp((ratio - kl_intermediate - 1).contiguous(), min=-10, max=10)
        self.assertTrue(torch.allclose(kl, expected_kl))

    def test_dummy_kl_fn(self):
        kl_fn_cls = KL_FN.get("none")
        kl_fn = kl_fn_cls(kl_coef=0.01)
        kl = kl_fn.calculate_kl(self.logprob, self.ref_logprob)
        expected_kl = torch.zeros_like(self.logprob)
        self.assertTrue(torch.allclose(kl, expected_kl))

    def test_corrected_k3_fallback(self):
        k3_fn = KL_FN.get("k3")(kl_coef=0.01)
        corrected_k3_fn = KL_FN.get("corrected_k3")(kl_coef=0.01)
        kl_standard = k3_fn.calculate_kl(self.logprob, self.ref_logprob)
        kl_corrected_no_old = corrected_k3_fn.calculate_kl(
            self.logprob, self.ref_logprob, old_logprob=None
        )
        self.assertTrue(torch.allclose(kl_standard, kl_corrected_no_old))

    def test_corrected_k3_with_old_logprob(self):
        corrected_k3_fn = KL_FN.get("corrected_k3")(kl_coef=0.01)
        kl_corrected = corrected_k3_fn.calculate_kl(
            self.logprob, self.ref_logprob, self.old_logprob
        )
        logr = self.ref_logprob - self.logprob
        kl_standard = logr.exp() - 1 - logr
        log_ratio_is = self.logprob - self.old_logprob
        ratio_is = log_ratio_is.exp()
        ratio_is = torch.clamp(ratio_is, min=0.0, max=2.0)
        expected_kl = ratio_is * kl_standard
        self.assertTrue(torch.allclose(kl_corrected, expected_kl))

    def test_corrected_k3_same_policy(self):
        k3_fn = KL_FN.get("k3")(kl_coef=0.01)
        corrected_k3_fn = KL_FN.get("corrected_k3")(kl_coef=0.01)
        kl_standard = k3_fn.calculate_kl(self.logprob, self.ref_logprob)
        kl_corrected = corrected_k3_fn.calculate_kl(self.logprob, self.ref_logprob, self.logprob)
        self.assertTrue(torch.allclose(kl_standard, kl_corrected, rtol=1e-4, atol=1e-6))

    def test_corrected_k3_loss(self):
        corrected_k3_fn = KL_FN.get("corrected_k3")(kl_coef=0.01)
        kl_loss, metrics = corrected_k3_fn.calculate_kl_loss(
            logprob=self.logprob,
            ref_logprob=self.ref_logprob,
            response_mask=self.response_mask,
            loss_agg_mode="token-mean",
            old_logprob=self.old_logprob,
        )
        self.assertEqual(kl_loss.dim(), 0)
        self.assertIn("kl_loss", metrics)
        self.assertIn("kl_coef", metrics)
        self.assertEqual(metrics["kl_coef"], 0.01)

    def test_kl_loss_aggregation_modes(self):
        corrected_k3_fn = KL_FN.get("corrected_k3")(kl_coef=0.01)
        kl_loss_mean, _ = corrected_k3_fn.calculate_kl_loss(
            logprob=self.logprob,
            ref_logprob=self.ref_logprob,
            response_mask=self.response_mask,
            loss_agg_mode="token-mean",
            old_logprob=self.old_logprob,
        )
        kl_loss_sum, _ = corrected_k3_fn.calculate_kl_loss(
            logprob=self.logprob,
            ref_logprob=self.ref_logprob,
            response_mask=self.response_mask,
            loss_agg_mode="seq-mean-token-sum",
            old_logprob=self.old_logprob,
        )
        self.assertGreater(kl_loss_sum.item(), kl_loss_mean.item())
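
Taken together, the assertions above pin down the estimator that each registry key is expected to implement. Writing the per-token log-ratio as ell = log pi_ref(x) - log pi(x), the expected values are (as encoded by the tests, not quoted from the library source):

```latex
\begin{aligned}
\text{k1:}\quad            & \log\pi(x) - \log\pi_{\mathrm{ref}}(x) = -\ell \\
\text{k2:}\quad            & \tfrac{1}{2}\,\ell^{2} \\
\text{k3:}\quad            & e^{\ell} - 1 - \ell \\
\text{abs:}\quad           & \lvert \ell \rvert \\
\text{low\_var\_kl:}\quad  & \operatorname{clip}\!\bigl(e^{\bar\ell} - \bar\ell - 1,\,-10,\,10\bigr), \qquad \bar\ell = \operatorname{clip}(\ell,\,-20,\,20) \\
\text{corrected\_k3:}\quad & \operatorname{clip}\!\bigl(e^{\log\pi(x) - \log\pi_{\mathrm{old}}(x)},\,0,\,2\bigr)\cdot\bigl(e^{\ell} - 1 - \ell\bigr)
\end{aligned}
```

with `corrected_k3` falling back to plain `k3` when no `old_logprob` is provided, and `none` returning zeros.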

tests/algorithm/policy_loss_test.py

Lines changed: 17 additions & 0 deletions
@@ -142,3 +142,20 @@ def test_ppo_policy_loss_with_sequence_masking(self):
         self.assertTrue(
             torch.allclose(torch.tensor(metrics["seq_mask/mean_sequence_kl"]), mean_sequence_kl)
         )
+
+    def test_sapo_policy_loss(self):
+        policy_loss_fn_cls = POLICY_LOSS_FN.get("sapo")
+        policy_loss_fn_args = policy_loss_fn_cls.default_args()
+        policy_loss_fn = policy_loss_fn_cls(**policy_loss_fn_args)
+        loss, metrics = policy_loss_fn(log_prob=self.logprob, **self.input_data.batch)
+        sapo_loss = torch.tensor(-0.05128994956612587)
+        ppo_kl = torch.tensor(-0.21663446724414825)
+        avg_soft_gate = torch.tensor(2.3191137313842773)
+        avg_ratio = torch.tensor(1.630766749382019)
+        pos_adv_frac = torch.tensor(0.3958333432674408)
+        self.assertTrue(torch.allclose(loss, sapo_loss))
+        self.assertTrue(torch.allclose(torch.tensor(metrics["sapo_loss"]), sapo_loss))
+        self.assertTrue(torch.allclose(torch.tensor(metrics["ppo_kl"]), ppo_kl))
+        self.assertTrue(torch.allclose(torch.tensor(metrics["avg_soft_gate"]), avg_soft_gate))
+        self.assertTrue(torch.allclose(torch.tensor(metrics["avg_ratio"]), avg_ratio))
+        self.assertTrue(torch.allclose(torch.tensor(metrics["pos_adv_frac"]), pos_adv_frac))

tests/cli/launcher_test.py

Lines changed: 74 additions & 25 deletions
@@ -41,6 +41,7 @@ def setUp(self):

     def tearDown(self):
         sys.argv = self._orig_argv
+        shutil.rmtree(self.config.checkpoint_job_dir, ignore_errors=True)

     @mock.patch("trinity.cli.launcher.serve")
     @mock.patch("trinity.cli.launcher.explore")
@@ -254,31 +255,79 @@ def test_multi_stage_run(
     @mock.patch("trinity.cli.launcher.load_config")
     def test_debug_mode(self, mock_load):
         process = multiprocessing.Process(target=debug_inference_model_process)
-        process.start()
-        time.sleep(15)  # wait for the model to be created
-        for _ in range(10):
-            try:
-                get_debug_inference_model(self.config)
-                break
-            except Exception:
-                time.sleep(3)
-        output_file = os.path.join(self.config.checkpoint_job_dir, "debug.html")
-        self.config.buffer.explorer_input.tasksets = [get_unittest_dataset_config("gsm8k")]
-        mock_load.return_value = self.config
-        with mock.patch(
-            "argparse.ArgumentParser.parse_args",
-            return_value=mock.Mock(
-                command="debug",
-                config="dummy.yaml",
-                module="workflow",
-                output_file=output_file,
-                plugin_dir="",
-            ),
-        ):
-            launcher.main()
-        process.join(timeout=10)
-        process.terminate()
-        self.assertTrue(os.path.exists(output_file))
+        try:
+            process.start()
+            time.sleep(15)  # wait for the model to be created
+            for _ in range(10):
+                try:
+                    get_debug_inference_model(self.config)
+                    break
+                except Exception:
+                    time.sleep(3)
+            output_file = os.path.join(self.config.checkpoint_job_dir, "debug.html")
+            output_dir = os.path.join(self.config.checkpoint_job_dir, "debug_output")
+            self.config.buffer.explorer_input.tasksets = [get_unittest_dataset_config("gsm8k")]
+            mock_load.return_value = self.config
+            with mock.patch(
+                "argparse.ArgumentParser.parse_args",
+                return_value=mock.Mock(
+                    command="debug",
+                    config="dummy.yaml",
+                    module="workflow",
+                    enable_profiling=True,
+                    output_dir=output_dir,
+                    output_file=output_file,
+                    plugin_dir="",
+                ),
+            ):
+                launcher.main()
+
+            self.assertFalse(os.path.exists(output_file))
+            self.assertTrue(os.path.exists(output_dir))
+            self.assertTrue(os.path.exists(os.path.join(output_dir, "profiling.html")))
+            self.assertTrue(os.path.exists(os.path.join(output_dir, "experiences.db")))
+            # add a dummy file to test overwrite behavior
+            with open(os.path.join(output_dir, "dummy.txt"), "w") as f:
+                f.write("not empty")
+
+            with mock.patch(
+                "argparse.ArgumentParser.parse_args",
+                return_value=mock.Mock(
+                    command="debug",
+                    config="dummy.yaml",
+                    module="workflow",
+                    enable_profiling=False,
+                    output_dir=output_dir,
+                    output_file=output_file,
+                    plugin_dir="",
+                ),
+            ):
+                launcher.main()
+
+            self.assertFalse(os.path.exists(output_file))
+            # test the original files are not overwritten
+            self.assertTrue(os.path.exists(output_dir))
+            self.assertTrue(os.path.exists(os.path.join(output_dir, "dummy.txt")))
+            dirs = os.listdir(self.config.checkpoint_job_dir)
+            target_output_dir = [d for d in dirs if d.startswith("debug_output_")]
+            self.assertEqual(len(target_output_dir), 1)
+            self.assertFalse(
+                os.path.exists(
+                    os.path.join(
+                        self.config.checkpoint_job_dir, target_output_dir[0], "profiling.html"
+                    )
+                )
+            )
+            self.assertTrue(
+                os.path.exists(
+                    os.path.join(
+                        self.config.checkpoint_job_dir, target_output_dir[0], "experiences.db"
+                    )
+                )
+            )
+        finally:
+            process.join(timeout=10)
+            process.terminate()


 def debug_inference_model_process():
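
The second half of the test pins down the collision behavior: when `--output-dir` already exists and is not empty, the debug run must leave it untouched and write into a fresh `debug_output_*` sibling instead. A minimal sketch of that selection logic, purely illustrative and not the actual launcher code (the real suffix scheme may differ), could look like:

```python
import os
import time


def pick_debug_output_dir(requested: str) -> str:
    """Return `requested` if it is missing or empty; otherwise a fresh sibling."""
    if not os.path.exists(requested) or not os.listdir(requested):
        os.makedirs(requested, exist_ok=True)
        return requested
    # Existing, non-empty directory: never overwrite, create a suffixed sibling.
    fresh = f"{requested.rstrip(os.sep)}_{time.strftime('%Y%m%d_%H%M%S')}"
    os.makedirs(fresh, exist_ok=True)
    return fresh
```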

tests/utils/plugins/dependencies.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
"""A file contains some dependencies."""

DEPENDENCY_VALUE = 0


def dependency_func():
    return "0"

tests/utils/plugins/main.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
from tests.utils.plugins.dependencies import DEPENDENCY_VALUE, dependency_func

from trinity.common.workflows.workflow import Workflow


class MainDummyWorkflow(Workflow):
    def __init__(self, *, task, model, auxiliary_models=None):
        super().__init__(task=task, model=model, auxiliary_models=auxiliary_models)

    @property
    def repeatable(self):
        return True

    def set_repeat_times(self, repeat_times, run_id_base):
        pass

    def run(self) -> list:
        return [DEPENDENCY_VALUE, dependency_func()]

tests/utils/registry_test.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
import unittest

import ray


class ImportUtils:
    def run(self):
        from trinity.common.workflows import WORKFLOWS, Workflow

        workflow_cls = WORKFLOWS.get("tests.utils.plugins.main.MainDummyWorkflow")
        assert issubclass(workflow_cls, Workflow)
        workflow = workflow_cls(task=None, model=None)
        res = workflow.run()
        assert res[0] == 0
        assert res[1] == "0"


class TestRegistry(unittest.TestCase):
    def setUp(self):
        ray.init(ignore_reinit_error=True)

    def tearDown(self):
        ray.shutdown()

    def test_dynamic_import(self):
        # test local import
        ImportUtils().run()
        # test remote import
        ray.get(ray.remote(ImportUtils).remote().run.remote())
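
The lookup key here is a fully qualified dotted path rather than a registered short name, and it resolves both in the local process and inside a Ray actor, so the registry must import the module on demand wherever the lookup runs. Conceptually the mechanism is dotted-path dynamic import; the sketch below illustrates the idea and is not the actual `WORKFLOWS.get` implementation (which may add caching, plugin-directory search, and error handling):

```python
import importlib


def resolve_dotted_path(path: str):
    """Import `pkg.module.ClassName` and return the class object."""
    module_path, _, attr_name = path.rpartition(".")
    module = importlib.import_module(module_path)
    return getattr(module, attr_name)


# Example: resolve_dotted_path("tests.utils.plugins.main.MainDummyWorkflow")
```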

trinity/algorithm/algorithm.py

Lines changed: 27 additions & 0 deletions
@@ -250,6 +250,33 @@ def default_config(cls) -> Dict:
         }


+@ALGORITHM_TYPE.register_module("sapo")
+class SAPOAlgorithm(AlgorithmType):
+    """SAPO (Soft Adaptive Policy Optimization) algorithm.
+
+    SAPO uses a smooth, temperature-controlled soft gate instead of hard clipping
+    to stabilize training while maintaining effective learning.
+    """
+
+    use_critic: bool = False
+    use_reference: bool = True
+    compute_advantage_in_trainer: bool = False
+    can_balance_batch: bool = True
+    schema: str = "experience"
+
+    @classmethod
+    def default_config(cls) -> Dict:
+        return {
+            "repeat_times": 2,
+            "advantage_fn": "grpo",
+            "sample_strategy": "default",
+            "policy_loss_fn": "sapo",
+            "kl_penalty_fn": "none",
+            "kl_loss_fn": "k2",
+            "entropy_loss_fn": "default",
+        }
+
+
 @ALGORITHM_TYPE.register_module("mix")
 class MIXAlgorithm(AlgorithmType):
     """MIX algorithm."""
