Add ray timeline for profiling (#98)

pan-x-c · web-flow · commit e4052f85a794 · 2025-06-24T17:01:47.000+08:00
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -107,11 +107,13 @@ Used to log training metrics during execution.
 ```yaml
 monitor:
   monitor_type: wandb
+  enable_ray_timeline: False
 ```
 
 - `monitor_type`: Type of monitoring system. Options:
   - `wandb`: Logs to [Weights & Biases](https://docs.wandb.ai/quickstart/). Requires logging in and setting `WANDB_API_KEY`. Project and run names match the `project` and `name` fields in global configs.
   - `tensorboard`: Logs to [TensorBoard](https://www.tensorflow.org/tensorboard). Files are saved under `<checkpoint_root_dir>/<project>/<name>/monitor/tensorboard`.
+- `enable_ray_timeline`: Whether to export the Ray timeline for profiling. Defaults to `False`. If set to `True`, a `timeline.json` file is written to `<checkpoint_root_dir>/<project>/<name>/monitor` when the run ends. You can open the file in Chrome at [chrome://tracing](chrome://tracing).
 
 ---
 
diff --git a/tests/trainer/trainer_test.py b/tests/trainer/trainer_test.py
@@ -149,6 +149,7 @@ def test_trainer(self):
         response_metrics = parser.metric_list("response_length")
         self.assertTrue(len(response_metrics) > 0)
         self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
+        ray.timeline(filename="timeline.json")
         ray.shutdown(_exiting_interpreter=True)
         # check checkpoint
         from trinity.common.models.utils import get_checkpoint_dir_with_step_num
diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py
@@ -237,19 +237,26 @@ def run(config_path: str, dlc: bool = False, plugin_dir: str = None):
         if not is_running:
             raise RuntimeError("Ray is not running, please start it by `ray start --head`.")
         ray.init(namespace=config.ray_namespace, ignore_reinit_error=True)
-    if config.mode == "explore":
-        explore(config)
-    elif config.mode == "train":
-        train(config)
-    elif config.mode == "both":
-        both(config)
-    elif config.mode == "bench":
-        bench(config)
-
-    if dlc:
-        from trinity.utils.dlc_utils import stop_ray_cluster
-
-        stop_ray_cluster(namespace=config.ray_namespace)
+    try:
+        if config.mode == "explore":
+            explore(config)
+        elif config.mode == "train":
+            train(config)
+        elif config.mode == "both":
+            both(config)
+        elif config.mode == "bench":
+            bench(config)
+    finally:
+        if config.monitor.enable_ray_timeline:
+            timeline_file = os.path.join(config.monitor.cache_dir, "timeline.json")
+            logger.info(f"Exporting Ray timeline to {timeline_file}...")
+            ray.timeline(filename=timeline_file)
+            logger.info("Done. You can open the timeline file in `chrome://tracing`")
+
+        if dlc:
+            from trinity.utils.dlc_utils import stop_ray_cluster
+
+            stop_ray_cluster(namespace=config.ray_namespace)
 
 
 def studio(port: int = 8501):
diff --git a/trinity/common/config.py b/trinity/common/config.py
@@ -319,6 +319,9 @@ class MonitorConfig:
     monitor_type: str = "tensorboard"
     # the default args for monitor
     monitor_args: Dict = field(default_factory=dict)
+    # whether to export the Ray timeline for profiling
+    # the output file will be saved to `<cache_dir>/timeline.json`
+    enable_ray_timeline: bool = False
     # ! DO NOT SET, automatically generated as checkpoint_job_dir/monitor
     cache_dir: str = ""