We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 9f4cb70, commit 7163eeb. Copy full SHA for 7163eeb
axlearn/common/launch_trainer.py
@@ -152,8 +152,10 @@ def _run_trainer_impl(trainer_config: SpmdTrainer.Config) -> Any:
152
# pylint: disable-next=import-error,import-outside-toplevel
153
from pathwaysutils.elastic import manager
154
elastic_manager = manager.Manager()
155
- while True:
+ max_attempts = 5
156
+ for attempt_index in range(max_attempts):
157
try:
158
+ logging.info(f"Elastic attempt {attempt_index + 1}/{max_attempts}")
159
trainer: SpmdTrainer = trainer_config.instantiate(parent=None)
160
prng_key = jax.random.PRNGKey(seed=FLAGS.trainer_prng_seed)
161
output = trainer.run(prng_key)
0 commit comments