Skip to content

Commit 7163eeb

Browse files
committed
Added a max retry limit for elasticity
1 parent 9f4cb70 commit 7163eeb

File tree

1 file changed

+3
-1
lines changed

1 file changed

+3
-1
lines changed

axlearn/common/launch_trainer.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -152,8 +152,10 @@ def _run_trainer_impl(trainer_config: SpmdTrainer.Config) -> Any:
152152
# pylint: disable-next=import-error,import-outside-toplevel
153153
from pathwaysutils.elastic import manager
154154
elastic_manager = manager.Manager()
155-
while True:
155+
max_attempts = 5
156+
for attempt_index in range(max_attempts):
156157
try:
158+
logging.info(f"Elastic attempt {attempt_index + 1}/{max_attempts}")
157159
trainer: SpmdTrainer = trainer_config.instantiate(parent=None)
158160
prng_key = jax.random.PRNGKey(seed=FLAGS.trainer_prng_seed)
159161
output = trainer.run(prng_key)

0 commit comments

Comments
 (0)