Skip to content

Commit c912f27

Browse files
committed
make naming more descriptive, update default in test util
Signed-off-by: Ananth Subramaniam <ansubramania@nvidia.com>
1 parent 1dea1e9 commit c912f27

File tree

12 files changed

+33
-29
lines changed

12 files changed

+33
-29
lines changed

src/megatron/bridge/training/checkpointing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,7 @@ def save_checkpoint(
652652
save_strategy = TorchDistSaveShardedStrategy(
653653
"torch_dist",
654654
1,
655-
thread_count=ckpt_cfg.thread_count,
655+
thread_count=ckpt_cfg.storage_writers_per_rank,
656656
)
657657
else:
658658
save_strategy = get_default_save_sharded_strategy(ckpt_cfg.ckpt_format)

src/megatron/bridge/training/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -852,9 +852,9 @@ class CheckpointConfig:
852852
use_checkpoint_args: bool = False
853853
"""Override any command line arguments with arguments from the checkpoint"""
854854

855-
thread_count: int = 1
856-
"""Number of threads to use during saving (torch_dist format only).
857-
Affects the number of checkpoint files: saving_ranks * thread_count."""
855+
storage_writers_per_rank: int = 1
856+
"""Number of storage writers per rank for torch_dist checkpoint format.
857+
Affects the number of checkpoint files: saving_ranks * storage_writers_per_rank."""
858858

859859
exit_on_missing_checkpoint: bool = False
860860
"""If 'load' is set, but checkpoint is not found (e.g., path typo), then exit instead of random initialization."""

tests/functional_tests/recipes/test_gpt_oss_recipes_finetune.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ def test_gpt_oss_finetune_recipes(
276276
config.checkpoint.save,
277277
5,
278278
ckpt_format=config.checkpoint.ckpt_format,
279-
thread_count=config.checkpoint.thread_count,
279+
storage_writers_per_rank=config.checkpoint.storage_writers_per_rank,
280280
)
281281

282282
finally:

tests/functional_tests/recipes/test_llama_recipes_distill_3b-1b.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def run_distill_recipe_test(
166166
config.checkpoint.save,
167167
10,
168168
ckpt_format=config.checkpoint.ckpt_format,
169-
thread_count=config.checkpoint.thread_count,
169+
storage_writers_per_rank=config.checkpoint.storage_writers_per_rank,
170170
)
171171

172172
finally:

tests/functional_tests/recipes/test_nemotronh_recipes_finetune.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ def test_nemotron_nano_v2_finetune_recipes(
304304
config.checkpoint.save,
305305
config.train.train_iters,
306306
ckpt_format=config.checkpoint.ckpt_format,
307-
thread_count=config.checkpoint.thread_count,
307+
storage_writers_per_rank=config.checkpoint.storage_writers_per_rank,
308308
)
309309

310310
finally:
@@ -582,7 +582,7 @@ def test_nemotron_3_nano_finetune_recipes(
582582
config.checkpoint.save,
583583
config.train.train_iters,
584584
ckpt_format=config.checkpoint.ckpt_format,
585-
thread_count=config.checkpoint.thread_count,
585+
storage_writers_per_rank=config.checkpoint.storage_writers_per_rank,
586586
)
587587

588588
finally:

tests/functional_tests/recipes/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def run_pretrain_recipe_test(
123123
config.checkpoint.save,
124124
10,
125125
ckpt_format=config.checkpoint.ckpt_format,
126-
thread_count=config.checkpoint.thread_count,
126+
storage_writers_per_rank=config.checkpoint.storage_writers_per_rank,
127127
)
128128

129129
finally:
@@ -291,7 +291,7 @@ def run_pretrain_vl_recipe_test(
291291
config.checkpoint.save,
292292
config.train.train_iters,
293293
ckpt_format=config.checkpoint.ckpt_format,
294-
thread_count=config.checkpoint.thread_count,
294+
storage_writers_per_rank=config.checkpoint.storage_writers_per_rank,
295295
)
296296

297297
finally:

tests/functional_tests/training/test_finetune_lora.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_pretrain_then_lora_finetune(self, tmp_path):
9191
pretrain_checkpoint_dir,
9292
pretrain_iters,
9393
ckpt_format=pretrain_cfg.checkpoint.ckpt_format,
94-
thread_count=pretrain_cfg.checkpoint.thread_count,
94+
storage_writers_per_rank=pretrain_cfg.checkpoint.storage_writers_per_rank,
9595
)
9696

9797
# Create LoRA config and run finetuning
@@ -103,7 +103,7 @@ def test_pretrain_then_lora_finetune(self, tmp_path):
103103
lora_checkpoint_dir,
104104
lora_iters,
105105
ckpt_format=lora_cfg.checkpoint.ckpt_format,
106-
thread_count=lora_cfg.checkpoint.thread_count,
106+
storage_writers_per_rank=lora_cfg.checkpoint.storage_writers_per_rank,
107107
)
108108
verify_peft_checkpoint_smaller(pretrain_checkpoint_dir, lora_checkpoint_dir, pretrain_iters, lora_iters)
109109

@@ -143,7 +143,7 @@ def test_lora_save_and_resume(self, tmp_path):
143143
pretrain_checkpoint_dir,
144144
pretrain_iters,
145145
ckpt_format=pretrain_cfg.checkpoint.ckpt_format,
146-
thread_count=pretrain_cfg.checkpoint.thread_count,
146+
storage_writers_per_rank=pretrain_cfg.checkpoint.storage_writers_per_rank,
147147
)
148148

149149
# Second run: LoRA finetuning initial phase (will be "interrupted")
@@ -165,7 +165,7 @@ def test_lora_save_and_resume(self, tmp_path):
165165
lora_checkpoint_dir,
166166
initial_lora_iters,
167167
ckpt_format=lora_initial_cfg.checkpoint.ckpt_format,
168-
thread_count=lora_initial_cfg.checkpoint.thread_count,
168+
storage_writers_per_rank=lora_initial_cfg.checkpoint.storage_writers_per_rank,
169169
)
170170

171171
# Third run: Resume LoRA finetuning from checkpoint (adapter-only states)
@@ -189,7 +189,7 @@ def test_lora_save_and_resume(self, tmp_path):
189189
lora_checkpoint_dir,
190190
total_lora_iters,
191191
ckpt_format=lora_resume_cfg.checkpoint.ckpt_format,
192-
thread_count=lora_resume_cfg.checkpoint.thread_count,
192+
storage_writers_per_rank=lora_resume_cfg.checkpoint.storage_writers_per_rank,
193193
)
194194
verify_peft_checkpoint_smaller(
195195
pretrain_checkpoint_dir, lora_checkpoint_dir, pretrain_iters, initial_lora_iters
@@ -227,7 +227,7 @@ def test_lora_finetune_with_packed_sequences(self, tmp_path):
227227
pretrain_checkpoint_dir,
228228
pretrain_iters,
229229
ckpt_format=pretrain_cfg.checkpoint.ckpt_format,
230-
thread_count=pretrain_cfg.checkpoint.thread_count,
230+
storage_writers_per_rank=pretrain_cfg.checkpoint.storage_writers_per_rank,
231231
)
232232

233233
# Create LoRA config with packed sequences and run finetuning
@@ -248,7 +248,7 @@ def test_lora_finetune_with_packed_sequences(self, tmp_path):
248248
lora_checkpoint_dir,
249249
lora_iters,
250250
ckpt_format=lora_cfg.checkpoint.ckpt_format,
251-
thread_count=lora_cfg.checkpoint.thread_count,
251+
storage_writers_per_rank=lora_cfg.checkpoint.storage_writers_per_rank,
252252
)
253253
verify_peft_checkpoint_smaller(pretrain_checkpoint_dir, lora_checkpoint_dir, pretrain_iters, lora_iters)
254254

tests/functional_tests/training/test_megatron_fsdp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def test_fsdp_pretrain_with_checkpoint(self, tmp_path):
315315
checkpoint_dir,
316316
total_iters,
317317
ckpt_format=cfg.checkpoint.ckpt_format,
318-
thread_count=cfg.checkpoint.thread_count,
318+
storage_writers_per_rank=cfg.checkpoint.storage_writers_per_rank,
319319
)
320320

321321
finally:
@@ -364,7 +364,7 @@ def test_fsdp_pretrain_save_resume(self, tmp_path):
364364
checkpoint_dir,
365365
checkpoint_iters,
366366
ckpt_format=cfg_first.checkpoint.ckpt_format,
367-
thread_count=cfg_first.checkpoint.thread_count,
367+
storage_writers_per_rank=cfg_first.checkpoint.storage_writers_per_rank,
368368
)
369369

370370
torch.distributed.barrier()
@@ -390,7 +390,7 @@ def test_fsdp_pretrain_save_resume(self, tmp_path):
390390
checkpoint_dir,
391391
total_iters,
392392
ckpt_format=cfg_second.checkpoint.ckpt_format,
393-
thread_count=cfg_second.checkpoint.thread_count,
393+
storage_writers_per_rank=cfg_second.checkpoint.storage_writers_per_rank,
394394
)
395395

396396
finally:

tests/functional_tests/training/test_pretrain_resume.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def test_pretrain_save_load(self, tmp_path):
167167
checkpoint_dir,
168168
checkpoint_iters,
169169
ckpt_format=cfg_first.checkpoint.ckpt_format,
170-
thread_count=cfg_first.checkpoint.thread_count,
170+
storage_writers_per_rank=cfg_first.checkpoint.storage_writers_per_rank,
171171
)
172172

173173
torch.distributed.barrier()
@@ -257,7 +257,7 @@ def test_pretrain_save_load(self, tmp_path):
257257
checkpoint_dir,
258258
total_iters,
259259
ckpt_format=cfg_second.checkpoint.ckpt_format,
260-
thread_count=cfg_second.checkpoint.thread_count,
260+
storage_writers_per_rank=cfg_second.checkpoint.storage_writers_per_rank,
261261
)
262262

263263
finally:

tests/functional_tests/training/test_seqpacking_cp_example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def test_sft_example_runs_with_cp_and_packing(self, tmp_path):
103103
checkpoint_dir,
104104
cfg.train.train_iters,
105105
ckpt_format=cfg.checkpoint.ckpt_format,
106-
thread_count=cfg.checkpoint.thread_count,
106+
storage_writers_per_rank=cfg.checkpoint.storage_writers_per_rank,
107107
)
108108
finally:
109109
clear_directories(shared_dir)

0 commit comments

Comments (0)