Skip to content

Commit 24b2980

Browse files
authored
Allow mixed-precision training in DDP/single-device scenarios (#36)
* Support a more recent torchtitan version, up to commit 0b44d4c, and allow AMP training in non-FSDP mode (as in pytorch/torchtitan@0b44d4c#diff-54e6f3c870acaf438db326aba3c3462b1848b4600cc37204de946da020805dd3)
* Fix --checkpoint.initial_load_model_weights_only
1 parent 6e49e20 commit 24b2980

File tree

3 files changed

+67
-12
lines changed

3 files changed

+67
-12
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ pip uninstall flash-linear-attention && pip install -U --no-use-pep517 git+https
3030

3131
[Important] Install specific version of torchtitan
3232
```
33-
pip install git+https://github.com/pytorch/torchtitan.git@5e2033c
33+
pip install git+https://github.com/pytorch/torchtitan.git@0b44d4c
3434
```
3535

3636

flame/config_manager.py

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,18 @@ def __init__(self):
183183
self.parser.add_argument(
184184
"--optimizer.lr", type=float, default=8e-4, help="Learning rate to use"
185185
)
186+
self.parser.add_argument(
187+
"--optimizer.beta1", type=float, default=0.9,
188+
help="Exponential moving average hyperparameters to use"
189+
)
190+
self.parser.add_argument(
191+
"--optimizer.beta2", type=float, default=0.95,
192+
help="Exponential moving average hyperparameters to use"
193+
)
194+
self.parser.add_argument(
195+
"--optimizer.weight_decay", type=float, default=0.1,
196+
help="Weight decay to use"
197+
)
186198
self.parser.add_argument(
187199
"--optimizer.implementation",
188200
type=str,
@@ -407,8 +419,10 @@ def __init__(self):
407419
default="bfloat16",
408420
choices=["bfloat16", "float32"],
409421
help="""
410-
torch dtype to use for parameters when applying mixed precision via FSDP.
411-
This feature only takes effect when data_parallel_shard_degree > 1
422+
torch dtype to use for parameters when applying mixed precision via fully_shard or torch.autocast.
423+
This feature takes effect via fully_shard when data_parallel_shard_degree > 1 or
424+
context_parallel_degree > 1; it takes effect via torch.autocast when data_replicate_degree >= 1
425+
and no other parallelism is enabled, i.e. under DDP or single-device training.
412426
""",
413427
)
414428
self.parser.add_argument(
@@ -606,19 +620,54 @@ def __init__(self):
606620
When enable_checkpoint is set to true, checkpoints will be in {--job.dump_folder}/{--checkpoint.folder}.
607621
""",
608622
)
623+
self.parser.add_argument(
624+
"--checkpoint.initial_load_path", type=str, default=None,
625+
help="""
626+
This option specifies the path to the initial checkpoint to load, which is
627+
particularly useful for resuming training from a previous run with a
628+
different output path or when loading a checkpoint from a pre-trained model.
629+
If the checkpoint folder for the current run is not empty,
630+
located at {--job.dump_folder}/{--checkpoint.folder}, this option will be ignored.
631+
This feature allows users to load an initial checkpoint from a different folder and
632+
continue training, saving new checkpoints to the specified folder without affecting
633+
the existing ones.
634+
635+
Note that the path should contain the full path to the checkpoint folder,
636+
including the step number, if any; for example,
637+
"//pre_train/checkpoints/llama3/llama3_8b/step_10000".
638+
"""
639+
)
640+
self.parser.add_argument(
641+
"--checkpoint.initial_load_model_weights_only",
642+
dest='checkpoint.initial_load_model_weights_only', action="store_true", default=True,
643+
help="""
644+
This option specifies if only the model weights should be loaded during the initial
645+
checkpoint load. The option is only used when `initial_load_path` is specified, and
646+
only applies to a model_weights_only checkpoint. Loading a periodic checkpoint
647+
may lead to unexpected behavior if this option is set to True.
648+
If False, the checkpoint at `initial_load_path` is treated as a standard training
649+
checkpoint, including optimizer and training states.
650+
The default setting for this option is True. Note that you will have to use
651+
`--checkpoint.no_initial_load_model_weights_only` to override the default setting.
652+
"""
653+
)
654+
self.parser.add_argument(
655+
"--checkpoint.no_initial_load_model_weights_only",
656+
dest='checkpoint.initial_load_model_weights_only', action="store_false",
657+
)
609658
self.parser.add_argument(
610659
"--checkpoint.interval",
611660
type=int,
612661
default=500,
613662
help="Checkpointing interval in steps.",
614663
)
615664
self.parser.add_argument(
616-
"--checkpoint.model_weights_only",
665+
"--checkpoint.last_save_model_weights_only",
617666
action="store_true",
618667
help="""
619-
When model_weights_only=True, only model weights will be saved at the end of training.
620-
With this, checkpoints can be loaded using `torch.load(..., weights_only=True)` after conversion.
621-
When model_weights_only=False, the full checkpoint will be saved.
668+
When last_save_model_weights_only=True, only model weights will be saved at the end of training,
669+
the last save. With this, checkpoints can be loaded using `torch.load(..., weights_only=True)`
670+
after conversion. When last_save_model_weights_only=False, the full checkpoint will be saved.
622671
A full checkpoint includes model, optimizer and train_state, which can be used to resume training.
623672
The default value is false.
624673
""",

flame/train.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,11 @@ def main(job_config: JobConfig):
350350
parallel_dims.loss_parallel_enabled,
351351
job_config.experimental.enable_compiled_autograd,
352352
)
353+
maybe_enable_amp = dist_utils.maybe_enable_amp(
354+
parallel_dims,
355+
job_config.training.mixed_precision_param,
356+
device_type,
357+
)
353358

354359
# variables used to keep info for metrics logging
355360
device_memory_monitor.reset_peak_stats()
@@ -484,11 +489,12 @@ def main(job_config: JobConfig):
484489
else:
485490
# Non-PP forward / backward
486491
with train_context(optional_context_parallel_ctx):
487-
output = model(
488-
input_ids=input_ids,
489-
labels=labels,
490-
position_ids=position_ids,
491-
cu_seqlens=cu_seqlens,
492+
with maybe_enable_amp:
493+
output = model(
494+
input_ids=input_ids,
495+
labels=labels,
496+
position_ids=position_ids,
497+
cu_seqlens=cu_seqlens,
492498
)
493499
loss = (
494500
output.loss

0 commit comments

Comments (0)