Skip to content

Commit 178c992

Browse files
authored
Fix DLC mode (#71)
1 parent e76403a commit 178c992

File tree

2 files changed

+48
-9
lines changed

2 files changed

+48
-9
lines changed

trinity/cli/launcher.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,11 @@ def run(config_path: str, dlc: bool = False):
187187
elif config.mode == "bench":
188188
bench(config)
189189

190+
if dlc:
191+
from trinity.utils.dlc_utils import stop_ray_cluster
192+
193+
stop_ray_cluster()
194+
190195

191196
def studio(port: int = 8501):
192197
from streamlit.web import cli as stcli

trinity/utils/dlc_utils.py

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,20 @@
99

1010
logger = get_logger(__name__)
1111

12+
CLUSTER_ACTOR_NAME = "cluster_status"
13+
14+
15+
@ray.remote
class ClusterStatus:
    """Ray actor that holds a single "job finished" flag.

    The flag starts out cleared, is set by ``finish()``, and is reported
    (inverted) by ``running()``.
    """

    def __init__(self):
        # Cleared until finish() is called.
        self.finished = False

    def finish(self) -> None:
        """Mark the job as finished."""
        self.finished = True

    def running(self) -> bool:
        """Return ``True`` until ``finish()`` has been called."""
        if self.finished:
            return False
        return True
25+
1226

1327
def get_dlc_env_vars() -> dict:
1428
envs = {
@@ -71,16 +85,36 @@ def setup_ray_cluster(namespace: str):
7185
logger.error(f"ret.stdout: {ret.stdout!r}")
7286
logger.error(f"ret.stderr: {ret.stderr!r}")
7387
sys.exit(1)
88+
89+
wait_for_ray_setup()
90+
ray.init(
91+
address=f"{env_vars['MASTER_ADDR']}:{env_vars['MASTER_PORT']}",
92+
namespace=namespace,
93+
ignore_reinit_error=True,
94+
)
7495
if is_master:
75-
wait_for_ray_setup()
76-
ray.init(
77-
address=f"{env_vars['MASTER_ADDR']}:{env_vars['MASTER_PORT']}",
78-
namespace=namespace,
79-
ignore_reinit_error=True,
80-
)
8196
# master wait for worker nodes to join
8297
wait_for_ray_worker_nodes(env_vars["WORLD_SIZE"])
98+
else:
99+
# worker waits on the cluster status actor
100+
cluster_status = ClusterStatus.options(
101+
name=CLUSTER_ACTOR_NAME,
102+
get_if_exists=True,
103+
).remote()
104+
while True:
105+
if ray.get(cluster_status.running.remote()):
106+
time.sleep(5)
107+
else:
108+
break
109+
sys.exit(0)
110+
83111

84-
if not is_master:
85-
# woker just exit
86-
sys.exit(0)
112+
def stop_ray_cluster():
    """Signal the cluster status actor that the job has finished.

    Obtains the actor registered under ``CLUSTER_ACTOR_NAME`` (requested
    with ``get_if_exists=True``) and blocks until its ``finish`` call has
    completed.
    """
    actor_options = {"name": CLUSTER_ACTOR_NAME, "get_if_exists": True}
    status_actor = ClusterStatus.options(**actor_options).remote()
    # Wait for the remote call so the flag is set before returning.
    finish_ref = status_actor.finish.remote()
    ray.get(finish_ref)

0 commit comments

Comments
 (0)