Merged
2 changes: 2 additions & 0 deletions docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -107,11 +107,13 @@ Used to log training metrics during execution.
```yaml
monitor:
monitor_type: wandb
enable_ray_timeline: False
```

- `monitor_type`: Type of monitoring system. Options:
- `wandb`: Logs to [Weights & Biases](https://docs.wandb.ai/quickstart/). Requires logging in and setting `WANDB_API_KEY`. Project and run names match the `project` and `name` fields in global configs.
- `tensorboard`: Logs to [TensorBoard](https://www.tensorflow.org/tensorboard). Files are saved under `<checkpoint_root_dir>/<project>/<name>/monitor/tensorboard`.
- `enable_ray_timeline`: Whether to export the Ray timeline. If set to `True`, a `timeline.json` file is exported to `<checkpoint_root_dir>/<project>/<name>/monitor`. You can open the exported file in Chrome at [chrome://tracing](chrome://tracing).
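For reference, a minimal sketch of what this flag triggers under the hood; the workload placeholder and output path here are illustrative, and in Trinity the export is performed by the launcher, as the `trinity/cli/launcher.py` change below shows:

```python
import ray

# Assumes a Ray cluster is already running (e.g. started with `ray start --head`).
ray.init(ignore_reinit_error=True)

# ... run the workload to be profiled ...

# Dump the profiling events collected so far into a Chrome-tracing file.
# ray.timeline() queries the running cluster, so call it before ray.shutdown().
ray.timeline(filename="timeline.json")
ray.shutdown()
```

Open the resulting `timeline.json` in Chrome at `chrome://tracing` to inspect task scheduling and actor activity.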

---

1 change: 1 addition & 0 deletions tests/trainer/trainer_test.py
@@ -149,6 +149,7 @@ def test_trainer(self):
response_metrics = parser.metric_list("response_length")
self.assertTrue(len(response_metrics) > 0)
self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)
ray.timeline(filename="timeline.json")
ray.shutdown(_exiting_interpreter=True)
# check checkpoint
from trinity.common.models.utils import get_checkpoint_dir_with_step_num
33 changes: 20 additions & 13 deletions trinity/cli/launcher.py
@@ -237,19 +237,26 @@ def run(config_path: str, dlc: bool = False, plugin_dir: str = None):
if not is_running:
raise RuntimeError("Ray is not running, please start it by `ray start --head`.")
ray.init(namespace=config.ray_namespace, ignore_reinit_error=True)
if config.mode == "explore":
explore(config)
elif config.mode == "train":
train(config)
elif config.mode == "both":
both(config)
elif config.mode == "bench":
bench(config)

if dlc:
from trinity.utils.dlc_utils import stop_ray_cluster

stop_ray_cluster(namespace=config.ray_namespace)
try:
if config.mode == "explore":
explore(config)
elif config.mode == "train":
train(config)
elif config.mode == "both":
both(config)
elif config.mode == "bench":
bench(config)
finally:
if config.monitor.enable_ray_timeline:
timeline_file = os.path.join(config.monitor.cache_dir, "timeline.json")
logger.info(f"Exporting Ray timeline to {timeline_file}...")
ray.timeline(filename=timeline_file)
logger.info("Done. You can open the timeline file in `chrome://tracing`")

if dlc:
from trinity.utils.dlc_utils import stop_ray_cluster

stop_ray_cluster(namespace=config.ray_namespace)


def studio(port: int = 8501):
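One note on the control flow above: wrapping the mode dispatch in `try`/`finally` means the timeline is exported even if the selected mode raises, so a crashed run can still be inspected. A minimal, self-contained sketch of the same pattern (the function and path names are made up for illustration):

```python
import os
import ray


def run_with_timeline(workload, cache_dir: str = "/tmp") -> None:
    """Run `workload()` and always export a Ray timeline, even on failure."""
    ray.init(ignore_reinit_error=True)
    try:
        workload()
    finally:
        # Executed on success *and* when the workload raises.
        ray.timeline(filename=os.path.join(cache_dir, "timeline.json"))
        ray.shutdown()


def failing_workload():
    raise RuntimeError("simulated training failure")


try:
    run_with_timeline(failing_workload)
except RuntimeError:
    pass  # timeline.json was still written by the finally block
```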
3 changes: 3 additions & 0 deletions trinity/common/config.py
@@ -319,6 +319,9 @@ class MonitorConfig:
monitor_type: str = "tensorboard"
# the default args for monitor
monitor_args: Dict = field(default_factory=dict)
# whether to enable ray timeline profile
# the output file will be saved to `cache_dir/timeline.json`
enable_ray_timeline: bool = False
# ! DO NOT SET, automatically generated as checkpoint_job_dir/monitor
cache_dir: str = ""
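For completeness, a small sketch of how the new field can be set programmatically, assuming `MonitorConfig` is importable from `trinity.common.config` and that its fields not shown in this hunk also have defaults:

```python
from trinity.common.config import MonitorConfig

# enable_ray_timeline defaults to False; opt in explicitly.
monitor = MonitorConfig(monitor_type="tensorboard", enable_ray_timeline=True)
assert monitor.enable_ray_timeline is True
```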
