google
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/p2p_checkpoint_manager_benchmark.yaml b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/p2p_checkpoint_manager_benchmark.yaml
Lines changed: 52 additions & 0 deletions
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core.py
Lines changed: 6 additions & 1 deletion
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/directory_setup.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/directory_setup.py
Lines changed: 2 additions & 2 deletions
@@ -0,0 +1,52 @@
+# The name for the entire test suite run.
+suite_name: "P2P CheckpointManager Benchmark"
+
+mesh_configs:
+  - mesh_axes: ["data", "stage", "fsdp", "fsdp_transpose", "sequence", "tensor", "expert", "autoregressive"]
+    # ICI (inter-chip interconnect): parallelism within a slice. Assuming 8 devices per slice.
+    # DCN (data-center network): parallelism across slices.
+    ici_parallelism: {"fsdp": 1, "tensor": 1, "data": 1}
+    dcn_parallelism: {"data": 1} # num_slices on the axis at replica_axis_index
+    process_is_granule: true
+  - mesh_axes: ["data", "model", "tensor", "fsdp"]
+    ici_parallelism: {"data": 1, "model": 1}
+    dcn_parallelism: {"data": 4, "model": 1}
+  - mesh_axes: ["data", "model", "tensor", "fsdp"]
+    ici_parallelism: {"data": 1, "model": 16}
+    dcn_parallelism: {"data": 4, "model": 1}
+    allow_split_physical_axes: true
+  - mesh_axes: ["data", "model", "tensor", "fsdp"]
+    ici_parallelism: {"data": 2, "model": 8}
+    dcn_parallelism: {"data": 2, "model": 1}
+    allow_split_physical_axes: true
+  - mesh_axes: ["data", "model", "tensor", "fsdp"]
+    ici_parallelism: {"data": 2, "model": 4}
+    dcn_parallelism: {"data": 2, "model": 1}
+    allow_split_physical_axes: true
+
+checkpoint_config:
+  spec:
+    a_1d: {dtype: "float32", shape: [32], sharding: [null]}
+    b_1d: {dtype: "float32", shape: [32], sharding: ["tensor"]}
+    c_2d: {dtype: "float32", shape: [32, 32], sharding: [null, "tensor"]}
+    d_2d: {dtype: "float32", shape: [32, 32], sharding: ["tensor", null]}
+    e_2d: {dtype: "float32", shape: [32, 32], sharding: ["tensor", "fsdp"]}
+    f_2d: {dtype: "float32", shape: [32, 32], sharding: ["fsdp", "tensor"]}
+    g_2d: {dtype: "float32", shape: [32, 32], sharding: [null, null]}
+    h_3d: {dtype: "float32", shape: [32, 32, 32], sharding: ["tensor", null, "fsdp"]}
+    i_3d: {dtype: "float32", shape: [32, 32, 32], sharding: [null, null, "tensor"]}
+    j_3d: {dtype: "float32", shape: [32, 32, 32], sharding: [null, null, "fsdp"]}
+    k_3d: {dtype: "float32", shape: [32, 32, 32], sharding: [null, null, null]}
+    custom_array: {dtype: "float32", shape: [8192, 64], sharding: ["tensor", null]}
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.p2p_checkpoint_manager_benchmark.P2pCheckpointManagerBenchmark"
+    options:
+      persistent_save_interval_steps: [2]
+      persistent_max_to_keep: [5]
+      local_save_interval_steps: [2]
+      local_max_to_keep: 2
+      replica_axis_index: 0
+      train_steps: 5
+      experimental_orbax_use_distributed_process_id: true
+      experimental_use_distributed_id_for_mesh_consistency: true
@@ -159,6 +159,11 @@ def run(self, repeat_index: int | None = None) -> TestResult:
     path = directory_setup.setup_test_directory(
         self.name, self.output_dir, repeat_index
     )
+    local_path = None
+    if self.local_directory is not None:
+      local_path = epath.Path(self.local_directory) / name
+      if repeat_index is not None:
+        local_path = local_path / f"repeat_{repeat_index}"
 
     with benchmark_metrics.measure(
         "sync_global_processes:benchmark:setup_test_directory"
@@ -185,7 +190,7 @@ def run(self, repeat_index: int | None = None) -> TestResult:
         options=self.options,
         mesh=self.mesh,
         repeat_index=repeat_index,
-        local_path=self.local_directory,
+        local_path=local_path,
     )
 
     test_context_summary = self._build_test_context_summary(context)
 
@@ -40,8 +40,8 @@ def setup_test_directory(
     path = path / f"repeat_{repeat_index}"
   logging.info("Setting up test directory at: %s", path)
   if jax.process_index() == 0:
-    if path.exists():
+    if path.exists() and not base_path.startswith("gs://"):
       logging.warning("Test directory %s already exists. Deleting it.", path)
       path.rmtree()
-    path.mkdir(parents=True, exist_ok=False)
+    path.mkdir(parents=True, exist_ok=True)
   return path