Skip to content

Commit 9f4cb70

Browse files
committed
Adding basic elastic training
Pull pathwaysutils from GitHub. Added guards to only use fast-resume if the proxy backend is used. Added the changes to the jobset for elastic training. Temporary changes to the configuration to decrease batch size. Added a stop_trace to cancel any ongoing traces. Take a checkpoint every 20,000,000 steps.
1 parent 253d77f commit 9f4cb70

File tree

4 files changed

+49
-9
lines changed

4 files changed

+49
-9
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,8 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
295295
f"--server_port={_PATHWAYS_PROXY_PORT}",
296296
f"--gcs_scratch_location={staging_location}",
297297
"--temporary_flags_for_debugging=temporary_flag_for_debugging_pipe_break_on_missing_keepalive=true",
298+
# This should be made configurable
299+
f"--num_elastic_slices={cfg.accelerator.num_replicas}",
298300
]
299301
cmd_args.extend(xla_flags_from_options(self._xla_options).split())
300302

@@ -566,14 +568,19 @@ def _build_pathways_worker_job(
566568
annotations.update(
567569
{"alpha.jobset.sigs.k8s.io/exclusive-topology": "cloud.google.com/gke-nodepool"}
568570
)
571+
# Default value for suspend and resume.
572+
# References:
573+
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
574+
# backoffLimit = system.vms_per_slice * 4
575+
576+
# This backoffLimit is just for verifying elastic fast-resume
577+
large_number = 1000
578+
backoffLimit = system.vms_per_slice * 4 * large_number
569579

570580
spec = dict(
571581
parallelism=system.vms_per_slice,
572582
completions=system.vms_per_slice,
573-
# Default value for suspend and resume.
574-
# References:
575-
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
576-
backoffLimit=system.vms_per_slice * 4,
583+
backoffLimit=backoffLimit,
577584
template=self._build_pathways_worker_pod(pathways_worker_replicated_job_index),
578585
)
579586
worker_job = dict(

axlearn/common/launch_trainer.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,41 @@ def _run_trainer_impl(trainer_config: SpmdTrainer.Config) -> Any:
148148
f,
149149
)
150150

151-
trainer: SpmdTrainer = trainer_config.instantiate(parent=None)
152-
prng_key = jax.random.PRNGKey(seed=FLAGS.trainer_prng_seed)
153-
return trainer.run(prng_key)
151+
if FLAGS.jax_backend == "proxy":
152+
# pylint: disable-next=import-error,import-outside-toplevel
153+
from pathwaysutils.elastic import manager
154+
elastic_manager = manager.Manager()
155+
while True:
156+
try:
157+
trainer: SpmdTrainer = trainer_config.instantiate(parent=None)
158+
prng_key = jax.random.PRNGKey(seed=FLAGS.trainer_prng_seed)
159+
output = trainer.run(prng_key)
160+
break
161+
except jax.errors.JaxRuntimeError as error:
162+
if not elastic_manager.is_error_due_to_slice_down(error):
163+
raise
164+
try:
165+
logging.info("Trying to clean up ongoing traces")
166+
jax.profiler.stop_trace()
167+
logging.info("Successfully cleaned up ongoing traces")
168+
except (RuntimeError, ValueError) as e:
169+
logging.info("No ongoing traces to clean up")
170+
except Exception as e:
171+
logging.exception("Error trying to clean up ongoing traces")
172+
raise
173+
174+
jax.clear_caches()
175+
for array in jax.live_arrays():
176+
array.delete()
177+
178+
ten_minutes = 10 * 60
179+
elastic_manager.wait_for_slices(timeout=ten_minutes)
180+
else:
181+
trainer: SpmdTrainer = trainer_config.instantiate(parent=None)
182+
prng_key = jax.random.PRNGKey(seed=FLAGS.trainer_prng_seed)
183+
output = trainer.run(prng_key)
184+
185+
return output
154186

155187

156188
def run_trainer(trainer_config: SpmdTrainer.Config) -> Any:

axlearn/experiments/text/gpt/fuji.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ def get_trainer_kwargs(
384384
max_sequence_length=max_sequence_length,
385385
train_batch_size=len(jax.devices()),
386386
max_step=max_step,
387+
save_every_n_steps=100,
387388
mesh_shape=mesh_shape_from_axes(data=-1, fsdp=8),
388389
mesh_rules=(
389390
# Step time:
@@ -848,7 +849,7 @@ def get_trainer_kwargs(
848849
max_sequence_length=max_sequence_length,
849850
train_batch_size=train_batch_size, # number of devices times 4 chips per device times 4096 samples per chip # train_batch_size,
850851
max_step=10_000, # max_step,
851-
save_every_n_steps=1000,
852+
save_every_n_steps=20_000_000,
852853
mesh_shape=mesh_shape_from_axes(data=-1, fsdp=64, model=4),
853854
mesh_rules=(
854855
(

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ tpu = [
114114
pathways-tpu = [
115115
"axlearn[gcp]",
116116
"jax==0.5.3", # must be >=0.4.19 for compat with v5p.
117-
"pathwaysutils==0.1.1",
117+
"pathwaysutils @ git+https://github.com/AI-Hypercomputer/pathways-utils",
118118
]
119119
# Vertex AI tensorboard. TODO(markblee): Merge with `gcp`.
120120
vertexai_tensorboard = [

0 commit comments

Comments
 (0)