Merge branch 'main' into data-fixes

daviswer · web-flow · commit 30320355de5f · 2025-05-29T15:44:12.000-04:00
diff --git a/code-of-conduct.md b/code-of-conduct.md
@@ -0,0 +1,3 @@
+# Foundation Model Stack Community Code of Conduct
+
+Please refer to [Foundation Model Stack Community Code of Conduct](https://github.com/foundation-model-stack/foundation-model-stack/blob/main/code-of-conduct.md).
diff --git a/fms_fsdp/utils/train_utils.py b/fms_fsdp/utils/train_utils.py
@@ -166,7 +166,7 @@ def train(
             ddp_stats.zero_()
         torch.cuda.reset_peak_memory_stats(device=torch.cuda.current_device())
 
-        if batch_idx % cfg.checkpoint_interval == 0:
+        if batch_idx % cfg.checkpoint_interval == 0 or batch_idx == cfg.num_steps:
             checkpointer.save(
                 batch_idx,
                 model,
diff --git a/main_training_llama.py b/main_training_llama.py
@@ -169,8 +169,6 @@ def main(**kwargs):
         tokens_seen,
     )
 
-    checkpointer.save_single_file(cfg.num_steps, model)
-
     dist.barrier()
     dist.destroy_process_group()
 
diff --git a/main_training_mamba.py b/main_training_mamba.py
@@ -169,8 +169,6 @@ def main(**kwargs):
         tokens_seen,
     )
 
-    checkpointer.save_single_file(cfg.num_steps, model)
-
     dist.barrier()
     dist.destroy_process_group()
 
diff --git a/speculator/train_speculator_utils.py b/speculator/train_speculator_utils.py
@@ -412,6 +412,7 @@ def train_speculator(
 
         if (
             batch_idx % cfg.checkpoint_interval == 0
+            or batch_idx == cfg.num_steps
             or do_ckpt(cfg.ckpt_save_path) is True
         ):
             torch.cuda.empty_cache()
@@ -425,13 +426,6 @@ def train_speculator(
             torch.cuda.empty_cache()
             do_ckpt(cfg.ckpt_save_path, reset=True)
 
-    checkpointer.save_single_file(
-        batch_idx,
-        speculator,
-        tokens_seen=elapsed_tokens + n_tok,
-        is_compiled=cfg.use_torch_compile,
-    )
-
 
 class EmbedGPTBigCode(GPTBigCode):
     # Overrides the forward function of GPTBigCode to allow returning embedding vectors

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Foundation Model Stack Community Code of Conduct`
	`2`	`+`
	`3`	`+Please refer to [Foundation Model Stack Community Code of Conduct](https://github.com/foundation-model-stack/foundation-model-stack/blob/main/code-of-conduct.md).`
Original file line number	Diff line number	Diff line change
`@@ -169,8 +169,6 @@ def main(**kwargs):`
`169`	`169`	`tokens_seen,`
`170`	`170`	`)`
`171`	`171`
`172`		`- checkpointer.save_single_file(cfg.num_steps, model)`
`173`		`-`
`174`	`172`	`dist.barrier()`
`175`	`173`	`dist.destroy_process_group()`
`176`	`174`