
Commit 937b6ea

added torchao params as cli launch params

1 parent ee8011d · commit 937b6ea

File tree: 5 files changed, +42 -7 lines

src/accelerate/accelerator.py

Lines changed: 0 additions & 1 deletion

@@ -2042,7 +2042,6 @@ def _prepare_ao(self, *args):
         if (
             self.is_fsdp2
             and len(optimizers) > 0
-            and self.ao_recipe_handler is not None
             and self.ao_recipe_handler.config.enable_fsdp_float8_all_gather
         ):
             from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp
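
For context, torchao's precompute_float8_dynamic_scale_for_fsdp is meant to run once per training step after the optimizer update, so the next FSDP all-gather can ship weights already in float8. Below is a minimal, hypothetical sketch of that call site, not Accelerate's actual code path; model, optimizer, and dataloader are placeholders, and the model is assumed to be FSDP2-wrapped and converted with torchao's convert_to_float8_training with enable_fsdp_float8_all_gather=True:

from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp

# Hypothetical training loop; `model`, `optimizer`, and `dataloader` are placeholders.
for batch in dataloader:
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Recompute per-parameter float8 scales so the next all-gather
    # can communicate the weights directly in float8.
    precompute_float8_dynamic_scale_for_fsdp(model)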

src/accelerate/commands/config/cluster.py

Lines changed: 16 additions & 2 deletions

@@ -794,11 +794,11 @@ def get_cluster_input():
     )
     if mixed_precision == "fp8":
         if not is_fp8_available():
-            raise ValueError("FP8 (either Transformer Engine or MSAMP) is not installed on this machine.")
+            raise ValueError("FP8 (either TorchAO, Transformer Engine or MSAMP) is not installed on this machine.")
         fp8_config = {}
         fp8_config["backend"] = _ask_options(
             "Which FP8 backend do you want to use?",
-            ["te", "msamp"],
+            ["ao", "te", "msamp"],
             _convert_fp8_backend,
         )
         if fp8_config["backend"] == "TE":
@@ -870,6 +870,20 @@ def get_cluster_input():
                 lambda x: "O1" if x == 0 else "O2",
                 default=1,
             )
+
+        elif fp8_config["backend"] == "AO":
+            if not is_torch_ao_available():
+                raise ValueError("TorchAO was selected, but it is not installed on this machine.")
+            fp8_config["enable_fsdp_float8_all_gather"] = _ask_field(
+                "Do you want to enable FSDP2 float8 all gather? This is recommended for better performance if using FSDP2. [YES/no]: ",
+                _convert_yes_no_to_bool,
+                default=True,
+            )
+            fp8_config["pad_inner_dim"] = _ask_field(
+                "Do you want to pad the inner dimension of weight matrices to multiples of 16 before float8 matmuls? Required for _scaled_mm which has strict alignment requirements. Note: padding may cause memory spikes. [YES/no]: ",
+                _convert_yes_no_to_bool,
+                default=True,
+            )
 
     if use_dynamo and mixed_precision == "no" and not use_cpu:
         print(
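
For illustration, answering the two new prompts with their defaults leaves get_cluster_input with roughly the following fp8_config for the AO backend (a sketch of the in-memory dict only; how it is serialized into the saved config file is outside this diff):

# Sketch of the prompt results when "ao" is selected and the defaults are accepted.
fp8_config = {
    "backend": "AO",                        # via _ask_options + _convert_fp8_backend
    "enable_fsdp_float8_all_gather": True,  # recommended for better performance with FSDP2
    "pad_inner_dim": True,                  # pad weight inner dims to multiples of 16 for _scaled_mm
}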

src/accelerate/commands/config/config_utils.py

Lines changed: 1 addition & 1 deletion

@@ -104,7 +104,7 @@ def _convert_sagemaker_distributed_mode(value):
 
 def _convert_fp8_backend(value):
     value = int(value)
-    return FP8BackendType(["TE", "MSAMP"][value])
+    return FP8BackendType(["AO", "TE", "MSAMP"][value])
 
 
 def _convert_yes_no_to_bool(value):
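
Since the conversion is positional, adding "AO" at index 0 shifts "TE" and "MSAMP" to indices 1 and 2. A self-contained sketch with a stand-in enum (not an import of accelerate's real FP8BackendType) shows the mapping:

from enum import Enum


# Stand-in mirroring accelerate's FP8BackendType, for illustration only.
class FP8BackendType(Enum):
    AO = "AO"
    TE = "TE"
    MSAMP = "MSAMP"


def _convert_fp8_backend(value):
    # The interactive menu returns the chosen option's index.
    value = int(value)
    return FP8BackendType(["AO", "TE", "MSAMP"][value])


print(_convert_fp8_backend(0))  # FP8BackendType.AO
print(_convert_fp8_backend(1))  # FP8BackendType.TE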

src/accelerate/commands/launch.py

Lines changed: 14 additions & 2 deletions

@@ -667,8 +667,8 @@ def launch_command_parser(subparsers=None):
     fp8_args.add_argument(
         "--fp8_backend",
         type=str,
-        choices=["te", "msamp"],
-        help="Choose a backend to train with FP8 (te: TransformerEngine, msamp: MS-AMP)",
+        choices=["ao", "te", "msamp"],
+        help="Choose a backend to train with FP8 (ao: TorchAO,te: TransformerEngine, msamp: MS-AMP)",
     )
     fp8_args.add_argument(
         "--fp8_use_autocast_during_eval",
@@ -721,6 +721,18 @@
         choices=["O1", "O2"],
         help="What level of 8-bit collective communication should be used with MS-AMP (useful only when `--fp8_backend=msamp` is passed).",
     )
+    fp8_args.add_argument(
+        "--fp8_enable_fsdp_float8_all_gather",
+        default="true",
+        type=str_to_bool,
+        help="Whether to enable FSDP float8 all gather (useful only when `--fp8_backend=ao` is passed).",
+    )
+    fp8_args.add_argument(
+        "--fp8_pad_inner_dim",
+        default="true",
+        type=str_to_bool,
+        help="Whether to pad the inner dimension for FP8 GEMMs (useful only when `--fp8_backend=ao` is passed).",
+    )
 
     # AWS arguments
     aws_args = parser.add_argument_group("AWS Arguments", "Arguments related to AWS.")
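
Both new flags default to true, so they only need to be passed explicitly to turn the behavior off. Assuming a training script named train.py (hypothetical), a launch command using the AO backend might look like:

accelerate launch --mixed_precision fp8 --fp8_backend ao \
    --fp8_pad_inner_dim false --fp8_enable_fsdp_float8_all_gather false train.py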

src/accelerate/utils/dataclasses.py

Lines changed: 11 additions & 1 deletion

@@ -335,7 +335,17 @@ def __post_init__(self):
         if self.config is None:
             from torchao.float8 import Float8LinearConfig
 
-            self.config = Float8LinearConfig(pad_inner_dim=True, enable_fsdp_float8_all_gather=True)
+            env_prefix = "ACCELERATE_FP8_"
+            # Check environment variables for overrides
+            pad_inner_dim = parse_flag_from_env(env_prefix + "PAD_INNER_DIM", default=True)
+            enable_fsdp_float8_all_gather = parse_flag_from_env(
+                env_prefix + "ENABLE_FSDP_FLOAT8_ALL_GATHER", default=True
+            )
+
+            self.config = Float8LinearConfig(
+                pad_inner_dim=pad_inner_dim,
+                enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
+            )
 
 
 @dataclass
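
A minimal sketch of the new override path, assuming torchao is installed: parse_flag_from_env below is a simplified stand-in for accelerate's helper, and the environment variable is set inline only to demonstrate what a launch with --fp8_pad_inner_dim false is expected to surface.

import os

from torchao.float8 import Float8LinearConfig  # real torchao config class


# Simplified stand-in for accelerate.utils.parse_flag_from_env, for illustration only.
def parse_flag_from_env(key, default=False):
    value = os.environ.get(key)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")


os.environ["ACCELERATE_FP8_PAD_INNER_DIM"] = "false"  # stands in for the launcher-set env var

config = Float8LinearConfig(
    pad_inner_dim=parse_flag_from_env("ACCELERATE_FP8_PAD_INNER_DIM", default=True),
    enable_fsdp_float8_all_gather=parse_flag_from_env(
        "ACCELERATE_FP8_ENABLE_FSDP_FLOAT8_ALL_GATHER", default=True
    ),
)
print(config.pad_inner_dim, config.enable_fsdp_float8_all_gather)  # False True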
