Draft
Changes from all commits
58 commits
6005b6d
add test script
samos123 Jun 20, 2025
40d1ed5
Jun Orbax regular checkpointer fixes
samos123 Jun 20, 2025
b9194c1
Orbax emergency trainer config for Fuji
samos123 May 22, 2025
74f7522
update fuji configs to use regular orbax checkpointer
samos123 Jun 20, 2025
8e80931
support keep_period
samos123 Jun 20, 2025
10d0576
support run for orbax regular checkpointer
samos123 Jun 20, 2025
abbe0ac
fix for A TypeHandler for "<class 'jax.Array'>" is already registered.
samos123 Jun 20, 2025
1292819
pdbs=1 and print every step
samos123 Jun 20, 2025
073cbfa
checkpoint every 100 steps
samos123 Jun 20, 2025
d723cc7
increase termination Grace Period to 300s
samos123 Jun 20, 2025
383d0bc
termination grace period to 900s
samos123 Jun 20, 2025
90f1e27
Revert "termination grace period to 900s"
samos123 Jun 20, 2025
871d2db
save the data iterator
samos123 Jun 25, 2025
bd25cff
use fuji v3 8b for tiktoken
samos123 Jul 21, 2025
8923ca6
use tokenizers instead of tokenizer
samos123 Jul 28, 2025
03c3fb6
disable saving of data iterator
samos123 Jul 28, 2025
410134d
use orbaxem
samos123 Jul 29, 2025
9051f90
add needed fix for orbax em
samos123 Jul 29, 2025
b10878f
enable orbax debug logging
samos123 Jul 29, 2025
bc3cbfc
sort the to be assigned keys and available process indexes
samos123 Jul 30, 2025
f8000d7
sort proc_infos as well
samos123 Jul 30, 2025
e752a12
Revert "sort proc_infos as well"
samos123 Jul 30, 2025
c7bdd39
Revert "sort the to be assigned keys and available process indexes"
samos123 Jul 30, 2025
13cb8ac
gemini fix?
samos123 Jul 30, 2025
ce3d7ad
fail fast
findmyway Jul 30, 2025
6223a3f
Revert "gemini fix?"
samos123 Jul 30, 2025
879afcb
Merge branch 'orbax-fuji-v2' of github.com:samos123/axlearn into orba…
samos123 Jul 30, 2025
1fcb62b
switch orbax with Jun's patch
samos123 Jul 30, 2025
ad4a498
add large scale config
samos123 Jul 31, 2025
3be33cd
enable BlockingRecreate
samos123 Jul 31, 2025
c4f20f2
disable BlockingRecreate on jobset
samos123 Jul 31, 2025
0d9513c
switch cluster
samos123 Aug 3, 2025
c4f8766
remove debug logging
samos123 Aug 3, 2025
0870285
add script to force delete pods
samos123 Aug 3, 2025
0a079a1
use BlockingRecreate
samos123 Aug 3, 2025
227c94b
add goodput recorder
samos123 Aug 4, 2025
0c2ce00
Integrate AXLearn with latest Goodput package
dipannita08 Jul 25, 2025
799c4a9
switch command to new goodput library
samos123 Aug 4, 2025
a138a9f
print log every step
samos123 Aug 4, 2025
5cc91ee
force deletion correctly for terminating pods
samos123 Aug 4, 2025
ecbfa95
switch to debug cluster gcs bucket
samos123 Aug 5, 2025
eab4ed5
bump orbax em fork
samos123 Aug 5, 2025
9318bcd
turn off goodput logging since it needs more permissions
samos123 Aug 5, 2025
2b11540
Revert "Integrate AXLearn with latest Goodput package"
samos123 Aug 5, 2025
f888f90
print jax_devices
samos123 Aug 6, 2025
8ef0220
fsdp=256 data=-1 so ici_dp=1
samos123 Aug 6, 2025
2c2e134
use latest main of orbax
samos123 Aug 7, 2025
8b3a425
70b fsdp=64,data=-1
samos123 Aug 7, 2025
ea6fc62
fsdp=32 with 70b
samos123 Aug 7, 2025
5f70377
70b fsdp=64
samos123 Aug 7, 2025
e33e04a
bump orbax to 0.11.21
samos123 Aug 7, 2025
dd14462
7b fsdp=16
samos123 Aug 7, 2025
5b1401a
use jun's fix for fsdp=16 data=16
samos123 Aug 8, 2025
b3f67b9
fsdp=256 7b
samos123 Aug 8, 2025
09df9e9
use orbax em with single replica GCS restore
samos123 Aug 8, 2025
edd48ee
70b fsdp=64
samos123 Aug 8, 2025
ff64d65
try with new orbax fix
samos123 Aug 9, 2025
056e0ea
update orbax
samos123 Aug 16, 2025
2 changes: 1 addition & 1 deletion Dockerfile
@@ -83,7 +83,7 @@ ENTRYPOINT ["/opt/apache/beam/boot"]

FROM base AS tpu

ARG EXTRAS=
ARG EXTRAS=orbax

ENV UV_FIND_LINKS=https://storage.googleapis.com/jax-releases/libtpu_releases.html
# Ensure we install the TPU version, even if building locally.
6 changes: 5 additions & 1 deletion axlearn/cloud/gcp/job.py
@@ -134,7 +134,11 @@ def _build_jobset(self) -> Nested[Any]:
return dict(
metadata=dict(name=cfg.name, annotations=annotations),
spec=dict(
failurePolicy=dict(maxRestarts=cfg.max_tries - 1),
failurePolicy=dict(
maxRestarts=cfg.max_tries - 1,
restartStrategy="BlockingRecreate"
# maxRestarts=cfg.max_tries - 1,
),
replicatedJobs=self._builder(),
),
)
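Note on the failurePolicy change above: "BlockingRecreate" is the JobSet restart strategy that, per the JobSet API, tears down all child Jobs before recreating them on a restart instead of recreating them in place; treat that description as an assumption, it is not verified here. A rough sketch of the fragment _build_jobset now emits, with cfg.max_tries stubbed as a plain value:

```python
# Sketch only (not the PR code): the failurePolicy fragment produced by _build_jobset,
# assuming cfg.max_tries == 3. The BlockingRecreate semantics described above are an
# assumption based on the JobSet API.
import json

max_tries = 3  # stand-in for cfg.max_tries
failure_policy = dict(
    maxRestarts=max_tries - 1,
    restartStrategy="BlockingRecreate",
)
print(json.dumps(dict(failurePolicy=failure_policy), indent=2))
```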
5 changes: 4 additions & 1 deletion axlearn/cloud/gcp/jobset_utils.py
@@ -451,6 +451,9 @@ def _build_container(self) -> Nested[Any]:
if cfg.enable_tpu_ici_resiliency is not None:
env_vars["ENABLE_ICI_RESILIENCY"] = str(cfg.enable_tpu_ici_resiliency).lower()

env_vars["TPU_PREMAPPED_BUFFER_SIZE"] = "137438953472"
env_vars["TPU_PREMAPPED_BUFFER_TRANSFER_THRESHOLD_BYTES"] = "137438953472"

resources = {"limits": {"google.com/tpu": system.chips_per_vm}}
# Set request memory by host machine type.
machine_memory_gi = GCE_MACHINE_TYPE_TO_MEMORY_CHARACTERISTICS.get(
@@ -690,7 +693,7 @@ def _build_pod(self) -> Nested[Any]:

spec = dict(
# NOTE: Don't set hostNetwork or dnsPolicy for compat with Workload Identity.
terminationGracePeriodSeconds=60,
terminationGracePeriodSeconds=300,
# Fail if any pod fails, and allow retries to happen at JobSet level.
restartPolicy="Never",
# https://kubernetes.io/docs/tasks/network/customize-hosts-file-for-pods/#adding-additional-entries-with-hostaliases
68 changes: 65 additions & 3 deletions axlearn/common/checkpointer_orbax.py
@@ -13,9 +13,12 @@
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import jax
import numpy as np
import orbax.checkpoint as ocp
import tensorflow as tf
from absl import logging
from orbax.checkpoint._src.metadata import array_metadata_store as array_metadata_store_lib
from orbax.checkpoint._src.serialization.type_handlers import ArrayHandler

from axlearn.common import utils
from axlearn.common.checkpointer import (
@@ -187,15 +190,18 @@ class Config(BaseCheckpointer.Config):

Attributes:
keep_last_n: Keep this many past ckpts.
keep_every_n_steps: If set, keep a checkpoint every n steps.
validation_type: Checkpoint validation during restore.
async_timeout_secs: Timeout for async barrier in seconds.
"""

keep_last_n: int = 1
keep_every_n_steps: Optional[int] = None
validation_type: CheckpointValidationType = CheckpointValidationType.EXACT
async_timeout_secs: int = 300
max_concurrent_save_gb: Optional[int] = None
max_concurrent_restore_gb: Optional[int] = None
enable_single_replica_ckpt_restoring: bool = True

@classmethod
def checkpoint_paths(cls, base_dir: str) -> List[str]:
@@ -237,6 +243,7 @@ def save_fn_with_summaries(step: int, last_saved_step: Optional[int]) -> bool:
options=ocp.CheckpointManagerOptions(
create=True,
max_to_keep=cfg.keep_last_n,
keep_period=cfg.keep_every_n_steps,
enable_async_checkpointing=True,
step_name_format=self._name_format,
should_save_fn=save_fn_with_summaries,
@@ -321,11 +328,33 @@ def restore(

cfg: OrbaxCheckpointer.Config = self.config

if cfg.enable_single_replica_ckpt_restoring:
array_handler = ocp.type_handlers.SingleReplicaArrayHandler(
replica_axis_index=0,
broadcast_memory_limit_bytes=1024 * 1024 * 1000, # 1000 MB limit
)
ocp.type_handlers.register_type_handler(jax.Array, array_handler, override=True)

def _restore_args(x: Any) -> ocp.RestoreArgs:
if isinstance(x, (Tensor, TensorSpec)):
return ocp.checkpoint_utils.construct_restore_args(
jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding)
)
if cfg.enable_single_replica_ckpt_restoring:
pspec = x.sharding.spec
mesh = x.sharding.mesh
replica_axis_index = 0
replica_devices = _replica_devices(mesh.devices, replica_axis_index)
replica_mesh = jax.sharding.Mesh(replica_devices, mesh.axis_names)
single_replica_sharding = jax.sharding.NamedSharding(replica_mesh, pspec)

return ocp.type_handlers.SingleReplicaArrayRestoreArgs(
sharding=jax.sharding.NamedSharding(mesh, pspec),
single_replica_sharding=single_replica_sharding,
global_shape=x.shape,
dtype=x.dtype,
)
else:
return ocp.checkpoint_utils.construct_restore_args(
jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding)
)
elif isinstance(x, tf.data.Iterator):
return _TfIteratorHandler.RestoreArgs(item=x)
elif _GRAIN_INSTALLED and isinstance(x, _GrainIterator):
@@ -349,6 +378,13 @@ def _restore_args(x: Any) -> ocp.RestoreArgs:
raise ValueError(f"Failed to restore at step {step}.") from e
logging.info("Could not find any completed checkpoints under %s: %s", cfg.dir, e)
return None, state # Return the input state.
finally:
if cfg.enable_single_replica_ckpt_restoring:
ocp.type_handlers.register_type_handler(
jax.Array,
ArrayHandler(array_metadata_store=array_metadata_store_lib.Store()),
override=True,
)

restored_index = composite_state["index"]
restored_state = composite_state["state"]
@@ -375,3 +411,29 @@ def wait_until_finished(self):
def stop(self, *, has_exception: bool = False):
"""See `BaseCheckpointer.stop` for details."""
self._manager.close()


def _find_idx(array: np.ndarray, replica_axis_idx: int):
"""Returns the index along given dimension that the current host belongs to."""
idx = None
for idx, val in np.ndenumerate(array):
if val.process_index == jax.process_index():
break
return idx[replica_axis_idx]


def _replica_devices(device_array: np.ndarray, replica_axis_idx: int):
"""Returns the devices from the replica that current host belongs to.

Replicas are assumed to be restricted to the first axis.

Args:
device_array: devices of the mesh that can be obtained by mesh.devices()
replica_axis_idx: axis dimension along which replica is taken

Returns:
devices inside the replica that current host is in
"""
idx = _find_idx(device_array, replica_axis_idx)
replica_result = np.take(device_array, idx, axis=replica_axis_idx)
return np.expand_dims(replica_result, axis=replica_axis_idx)
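Illustration of the two helpers above: _find_idx walks the device mesh to find where the current host sits along the replica axis, and _replica_devices slices out that single replica while keeping the replica axis with size 1. A standalone NumPy sketch with stand-in devices, since the real helpers need jax.process_index() and an actual mesh (names and values below are hypothetical):

```python
# Standalone sketch (not the PR code): mimic _find_idx/_replica_devices on a fake 2x4
# device mesh whose axis 0 is the replica axis. Each fake device only carries the
# process_index of the host that owns it; `my_process_index` replaces jax.process_index().
from types import SimpleNamespace

import numpy as np

fake_devices = np.array(
    [[SimpleNamespace(process_index=p) for _ in range(4)] for p in range(2)]
)

def find_idx(array: np.ndarray, replica_axis_idx: int, my_process_index: int) -> int:
    # Same traversal as _find_idx above, parameterized on the current process index.
    idx = None
    for idx, val in np.ndenumerate(array):
        if val.process_index == my_process_index:
            break
    return idx[replica_axis_idx]

idx = find_idx(fake_devices, replica_axis_idx=0, my_process_index=1)
replica = np.expand_dims(np.take(fake_devices, idx, axis=0), axis=0)
print(replica.shape)  # (1, 4): only the replica that "host 1" belongs to.
```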
5 changes: 5 additions & 0 deletions axlearn/common/checkpointer_orbax_emergency.py
@@ -90,6 +90,9 @@ def setup(spec: str):
FLAGS.process_id = info.inv_proc_id
FLAGS.distributed_coordinator = info.address
FLAGS.experimental_orbax_use_distributed_process_id = True
# Required for case when slices swap and ici_dp=2 or higher.
# PR that introduced this flag: https://github.com/google/orbax/pull/2222
FLAGS.experimental_use_distributed_id_for_mesh_consistency = False
yield


@@ -314,6 +317,8 @@ def _init_consistent_proc_ids(
# Then, rank 0 assigns inv_proc_id for worker that's missing their inv_proc_id and find the
# coordinator address.
if local_proc_info.cur_proc_id == 0:
jax_devices = [dev.id for dev in jax.devices()]
logging.info("jax_devices=%s", jax_devices)
ids = client.key_value_dir_get(key_prefix)
proc_infos: list[_ProcessInfo] = []

7 changes: 6 additions & 1 deletion axlearn/common/compiler_options.py
@@ -58,7 +58,12 @@ def default_xla_options(
# cause the step time to be double. You should increase this
# further if you see "Allocator failed to allocate". A feature
# to dynamically allocate may come later: b/380514965
megascale_grpc_premap_memory_bytes=17179869184,
# Needed for orbax emergency checkpointer
# Needed for decent perf
megascale_grpc_premap_memory_bytes=137438953472,
# needed for restore consistent hash
megascale_jax_offset_launch_id_by_module_name=True,
megascale_jax_use_device_set_based_launch_id=False,
# Flag controlling the maximum number of overlapping host offloadings.
xla_tpu_host_transfer_overlap_limit=24,
# Flag controlling the maximum number of overlapping cross-DCN send/recv.
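Note on the value change above: megascale_grpc_premap_memory_bytes goes from 17179869184 (16 GiB) to 137438953472 (128 GiB), the same size as the TPU_PREMAPPED_BUFFER_SIZE environment variable added in jobset_utils.py. Quick arithmetic check (sketch only; the rationale is the one given in the diff comments):

```python
# Arithmetic only, not part of the PR.
assert 17179869184 == 16 * 1024**3    # previous value: 16 GiB
assert 137438953472 == 128 * 1024**3  # new value: 128 GiB, matches TPU_PREMAPPED_BUFFER_SIZE
```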
70 changes: 34 additions & 36 deletions axlearn/common/trainer.py
@@ -613,7 +613,7 @@ def run(
)
self.vlog(3, "Done step %s", self.step)
num_steps += 1
if num_steps % 100 == 0:
if num_steps % 1 == 0:
now = time.perf_counter()
average_step_time = (now - start_time) / num_steps
self._step_log("Average step time: %s seconds", average_step_time)
@@ -927,42 +927,40 @@ def restore_checkpoint(self, restore_step: Optional[int] = None) -> Optional[int
**ckpt_state_spec, input_iter=iter(self.input.dataset())
)
restore_input_iter = cfg.save_input_iterator
try:
# Try to restore with `input_iter`.
step, ckpt_state = self.checkpointer.restore(
step=restore_step,
state=(
ckpt_state_spec_with_input_iter if restore_input_iter else ckpt_state_spec
),
)
if step is not None:
self.vlog(
0,
"Restored checkpoint at %s with restore_input_iter=%s",
step,
restore_input_iter,
)
except ValueError as e:
logging.warning(
"Attempt to restore checkpoint with restore_input_iter=%s failed: %s",
# try:
# Try to restore with `input_iter`.
step, ckpt_state = self.checkpointer.restore(
step=restore_step,
state=(ckpt_state_spec_with_input_iter if restore_input_iter else ckpt_state_spec),
)
if step is not None:
self.vlog(
0,
"Restored checkpoint at %s with restore_input_iter=%s",
step,
restore_input_iter,
e,
)
# Restore with a different restore_input_iter setting.
restore_input_iter = not restore_input_iter
step, ckpt_state = self.checkpointer.restore(
step=restore_step,
state=(
ckpt_state_spec_with_input_iter if restore_input_iter else ckpt_state_spec
),
)
if step is not None:
self.vlog(
0,
"Restored checkpoint at %s with restore_input_iter=%s",
step,
restore_input_iter,
)
# except ValueError as e:
# logging.warning(
# "Attempt to restore checkpoint with restore_input_iter=%s failed: %s",
# restore_input_iter,
# e,
# )
# # Restore with a different restore_input_iter setting.
# restore_input_iter = not restore_input_iter
# step, ckpt_state = self.checkpointer.restore(
# step=restore_step,
# state=(
# ckpt_state_spec_with_input_iter if restore_input_iter else ckpt_state_spec
# ),
# )
# if step is not None:
# self.vlog(
# 0,
# "Restored checkpoint at %s with restore_input_iter=%s",
# step,
# restore_input_iter,
# )
if step is not None:
self._step = step
self._trainer_state = TrainerState(
@@ -1097,7 +1095,7 @@ def _run_step(
# Run the compiled function.
self._trainer_state, outputs = compiled_train_step_fn(self.trainer_state, input_batch)

if self.step % 100 == 0 or 0 <= self.step <= 5:
if self.step % 1 == 0 or 0 <= self.step <= 5:
self._step_log(
"loss=%s aux=%s",
outputs["loss"],
56 changes: 50 additions & 6 deletions axlearn/experiments/text/gpt/common.py
@@ -643,6 +643,7 @@ def get_trainer_config_fn(
keep_every_n_steps: int = 50_000,
save_every_n_steps: Optional[int] = None,
init_state_builder: Optional[state_builder.Builder.Config] = None,
checkpointer: str = "",
) -> TrainerConfigFn:
"""Builds a TrainerConfigFn according to the model and input specs.

@@ -709,13 +710,56 @@ def config_fn() -> InstantiableConfig:
)
)
cfg.evalers[name] = evaler_cfg

# Summaries and checkpoints.
cfg.checkpointer.save_policy = config_for_function(every_n_steps_and_last_policy).set(
n=save_every_n_steps or min(eval_every_n_steps, 5_000),
max_step=max_step,
)
cfg.checkpointer.keep_every_n_steps = min(max_step, keep_every_n_steps)
cfg.checkpointer.keep_last_n = 3
calculated_save_every_n_steps = save_every_n_steps or min(eval_every_n_steps, 100)

if not checkpointer:
cfg.checkpointer.save_policy = config_for_function(every_n_steps_and_last_policy).set(
n=calculated_save_every_n_steps,
max_step=max_step,
)
cfg.checkpointer.keep_every_n_steps = min(max_step, keep_every_n_steps)
cfg.checkpointer.keep_last_n = 3
elif checkpointer == "OrbaxEmergencyCheckpointer":
# Prevent global dependency on Orbax.
# pylint: disable-next=import-outside-toplevel
from axlearn.common.checkpointer_orbax_emergency import OrbaxEmergencyCheckpointer

ckpt_config: OrbaxEmergencyCheckpointer.Config = (
OrbaxEmergencyCheckpointer.default_config()
)
ckpt_config.save_policy = config_for_function(every_n_steps_and_last_policy).set(
n=calculated_save_every_n_steps,
max_step=max_step,
)
ckpt_config.local_save_policy = config_for_function(every_n_steps_and_last_policy).set(
n=20,
max_step=max_step,
)
ckpt_config.local_dir = "/host-tmp/checkpoints"
ckpt_config.keep_every_n_steps = min(max_step, keep_every_n_steps)
ckpt_config.keep_last_n = 3
ckpt_config.replica_axis_index = 1
cfg.checkpointer = ckpt_config
elif checkpointer == "OrbaxRegularCheckpointer":
# Prevent global dependency on Orbax.
# pylint: disable-next=import-outside-toplevel
from axlearn.common.checkpointer_orbax import OrbaxCheckpointer

ckpt_config: OrbaxCheckpointer.Config = OrbaxCheckpointer.default_config()
ckpt_config.save_policy = config_for_function(every_n_steps_and_last_policy).set(
n=calculated_save_every_n_steps,
max_step=max_step,
)
ckpt_config.keep_every_n_steps = min(max_step, keep_every_n_steps)
ckpt_config.keep_last_n = 3
cfg.checkpointer = ckpt_config

# Save the data iterator as part of the checkpointing process.
# default is false.
# cfg.save_input_iterator = True

cfg.summary_writer.write_every_n_steps = min(eval_every_n_steps, 100)
cfg.summary_writer.max_queue = 1000
if len(mesh_axis_names) != len(mesh_shape):
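Note on the new `checkpointer` argument above: an empty string keeps the existing AXLearn checkpointer, "OrbaxEmergencyCheckpointer" adds a fast local save cadence (every 20 steps under /host-tmp/checkpoints) alongside the persistent one, and "OrbaxRegularCheckpointer" uses plain Orbax with only the persistent cadence. A small sketch of how the two cadences in the emergency branch interleave, assuming every_n_steps_and_last_policy fires on multiples of n and at max_step (numbers are hypothetical):

```python
# Illustration only (not axlearn code). Mirrors the OrbaxEmergencyCheckpointer branch with
# save_every_n_steps unset and eval_every_n_steps >= 100: persistent saves every 100 steps,
# local saves every 20 steps, both also firing at max_step.
persistent_every, local_every, max_step = 100, 20, 250

for step in range(1, max_step + 1):
    persistent = step % persistent_every == 0 or step == max_step
    local = step % local_every == 0 or step == max_step
    if persistent or local:
        kinds = "+".join(k for k, hit in (("persistent", persistent), ("local", local)) if hit)
        print(f"step {step}: {kinds} save")
```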