diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md index f01de817c1..f6a6d8c780 100644 --- a/docs/sphinx_doc/source/tutorial/trinity_configs.md +++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md @@ -107,11 +107,13 @@ Used to log training metrics during execution. ```yaml monitor: monitor_type: wandb + enable_ray_timeline: False ``` - `monitor_type`: Type of monitoring system. Options: - `wandb`: Logs to [Weights & Biases](https://docs.wandb.ai/quickstart/). Requires logging in and setting `WANDB_API_KEY`. Project and run names match the `project` and `name` fields in global configs. - `tensorboard`: Logs to [TensorBoard](https://www.tensorflow.org/tensorboard). Files are saved under `<checkpoint_root_dir>/<project>/<name>/monitor/tensorboard`. +- `enable_ray_timeline`: Whether to export the ray timeline. If set to `True`, a `timeline.json` file will be exported to `<checkpoint_root_dir>/<project>/<name>/monitor`. You can view the timeline file in Chrome at [chrome://tracing](chrome://tracing). 
--- diff --git a/tests/trainer/trainer_test.py b/tests/trainer/trainer_test.py index 811a1ba64d..32d19e9190 100644 --- a/tests/trainer/trainer_test.py +++ b/tests/trainer/trainer_test.py @@ -149,6 +149,7 @@ def test_trainer(self): response_metrics = parser.metric_list("response_length") self.assertTrue(len(response_metrics) > 0) self.assertEqual(parser.metric_max_step(response_metrics[0]), 4) + ray.timeline(filename="timeline.json") ray.shutdown(_exiting_interpreter=True) # check checkpoint from trinity.common.models.utils import get_checkpoint_dir_with_step_num diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py index 124475137c..e4123820de 100644 --- a/trinity/cli/launcher.py +++ b/trinity/cli/launcher.py @@ -237,19 +237,26 @@ def run(config_path: str, dlc: bool = False, plugin_dir: str = None): if not is_running: raise RuntimeError("Ray is not running, please start it by `ray start --head`.") ray.init(namespace=config.ray_namespace, ignore_reinit_error=True) - if config.mode == "explore": - explore(config) - elif config.mode == "train": - train(config) - elif config.mode == "both": - both(config) - elif config.mode == "bench": - bench(config) - - if dlc: - from trinity.utils.dlc_utils import stop_ray_cluster - - stop_ray_cluster(namespace=config.ray_namespace) + try: + if config.mode == "explore": + explore(config) + elif config.mode == "train": + train(config) + elif config.mode == "both": + both(config) + elif config.mode == "bench": + bench(config) + finally: + if config.monitor.enable_ray_timeline: + timeline_file = os.path.join(config.monitor.cache_dir, "timeline.json") + logger.info(f"Exporting Ray timeline to {timeline_file}...") + ray.timeline(filename=timeline_file) + logger.info("Done. 
You can open the timeline file in `chrome://tracing`") + + if dlc: + from trinity.utils.dlc_utils import stop_ray_cluster + + stop_ray_cluster(namespace=config.ray_namespace) def studio(port: int = 8501): diff --git a/trinity/common/config.py b/trinity/common/config.py index bcce3bf217..5d60cf8c4c 100644 --- a/trinity/common/config.py +++ b/trinity/common/config.py @@ -319,6 +319,9 @@ class MonitorConfig: monitor_type: str = "tensorboard" # the default args for monitor monitor_args: Dict = field(default_factory=dict) + # whether to enable ray timeline profile + # the output file will be saved to `cache_dir/timeline.json` + enable_ray_timeline: bool = False # ! DO NOT SET, automatically generated as checkpoint_job_dir/monitor cache_dir: str = ""