
Commit e9874ef

Merge branch 'main' into fast_loading
2 parents b137e0c + 207eb06

File tree: 4 files changed (+20, -9 lines)

fms_mo/aiu_addons/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -1,3 +1,7 @@
+# Local
+from fms_mo.prep import available_packages
+
+
 def _infer_quantization_config(quant_config: dict) -> dict | None:
     """Construct linear_config dictionary carrying FP8 configuration for FMS.
 
@@ -20,9 +24,14 @@ def _infer_quantization_config(quant_config: dict) -> dict | None:
         quant_config["config_groups"]["group_0"]["weights"]["type"] == "float"
         and quant_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8
     ):
+        if not available_packages["torchao"]:
+            raise ImportError(
+                "You need torchao installed to load FP8 checkpoints in FMS"
+            )
         # First, import required FP8 linear classes from fms-mo
         # Local
         import fms_mo.aiu_addons.fp8.fp8_adapter  # pylint: disable=unused-import
+        import fms_mo.aiu_addons.fp8.fp8_attn  # pylint: disable=unused-import
         import fms_mo.aiu_addons.fp8.fp8_linear  # pylint: disable=unused-import
 
         # This is used by get_linear to decide whether a linear layer
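The new guard fails fast with an actionable message when torchao is missing, rather than surfacing a confusing import error deep inside the fp8 modules. For context, `available_packages` (imported from `fms_mo.prep`) maps package names to installed/not-installed flags; a minimal stand-alone sketch of that pattern (the dict construction below is an illustrative assumption, not the fms-mo implementation):

    # Sketch only: approximates the availability check the hunk relies on.
    # The real `available_packages` lives in fms_mo.prep; this stand-in is an assumption.
    import importlib.util

    available_packages = {
        name: importlib.util.find_spec(name) is not None
        for name in ("torchao", "llmcompressor")
    }

    if not available_packages["torchao"]:
        raise ImportError("You need torchao installed to load FP8 checkpoints in FMS")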

fms_mo/aiu_addons/fp8/fp8_attn.py

Lines changed: 5 additions & 3 deletions
@@ -29,6 +29,7 @@
 # Third Party
 from fms.modules.attention import (
     AttentionKwargs,
+    _sdpa_compute_op,
     _sdpa_update_attn_kwargs,
     register_attention_op,
 )
@@ -219,8 +220,9 @@ def _math_fp8_compute_op(
         .to(dtype=orig_dtype)
         .transpose(-2, -1)
     )
-    attn_weight = query @ key_t
-    attn_weight *= scale_factor
+    attn_weight = (query * math.sqrt(scale_factor)) @ (
+        key_t * math.sqrt(scale_factor)
+    )
     attn_weight += attn_bias
     attn_weight = torch.softmax(attn_weight, dim=-1)
     attn_weight = torch.dropout(attn_weight, p_dropout, train=True)
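The rewritten line is algebraically the same product, since (query * sqrt(s)) @ (key_t * sqrt(s)) equals s * (query @ key_t); applying sqrt(scale_factor) to each operand before the matmul keeps the intermediate values in a smaller range, which is friendlier to reduced-precision accumulation. A stand-alone sketch of the equivalence (shapes and the scale value are illustrative assumptions, not taken from the fms code):

    # Sketch: both formulations yield the same attention logits up to float rounding.
    import math
    import torch

    query = torch.randn(2, 4, 8)        # (batch, seq, head_dim) -- illustrative shapes
    key_t = torch.randn(2, 8, 4)        # key already transposed
    scale_factor = 1.0 / math.sqrt(8)   # typical 1/sqrt(head_dim) scaling

    old = (query @ key_t) * scale_factor
    new = (query * math.sqrt(scale_factor)) @ (key_t * math.sqrt(scale_factor))
    torch.testing.assert_close(old, new)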
@@ -340,7 +342,7 @@ def __spyre_scaled_paged_validate_attn_kwargs_op(
 register_attention_op(
     "spyre_paged_attn_fp8",
     _spyre_scaled_paged_store_op,
-    compute_op=_math_fp8_compute_op,
+    compute_op=_sdpa_compute_op,
     is_prefill_op=lambda **attn_kwargs: attn_kwargs.get("block_table", None)
     is None,
     compute_decode_op=_spyre_scaled_paged_compute_op,

fms_mo/aiu_addons/fp8/fp8_linear.py

Lines changed: 3 additions & 3 deletions
@@ -321,12 +321,12 @@ def shard_fp8_linear(
     sharding  | param          | shard | dim |
     ----------+----------------+-------+-----|
     colwise   | weight         | Y     | 0   |
-              | weight_scale   | N     | -   |
+              | weight_scale   | Y/N   | 0/- |
               | input_scale    | N     | -   |
               | bias           | Y     | 0   |
     ----------+----------------+-------+-----|
     rowwise   | weight         | Y     | 1   |
-              | weight_scale   | Y/N   | 0/- |
+              | weight_scale   | N     | -   |
               | input_scale    | Y/N   | 0/- |
               | bias           | 0     | -   |
     """
@@ -339,7 +339,7 @@ def shard_fp8_linear(
     ]
     # Scales are per-row or per-tensor
     # Only sharding needed when row parallel and per-row
-    shard_scales = weight_strategy != "tensor" and module_info.sharding_dim == 1
+    shard_scales = weight_strategy != "tensor" and module_info.sharding_dim == 0
     params: dict[str, LinearParameterShardingInfo] = {
         "weight": LinearParameterShardingInfo(
             module_info.sharding_dim, ShardType.SHARD
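These two hunks bring the docstring table and the `shard_scales` condition into agreement: when the weight is split along dim 0 (output features), a per-row `weight_scale` carries one entry per output feature and must be split along the same dim, while a per-tensor scale is simply replicated. A minimal sketch of that relationship with made-up shapes (plain torch, not the fms sharding API):

    # Sketch: a per-row weight scale must follow a dim-0 weight shard (illustrative only).
    import torch

    out_features, in_features, tp_world = 8, 4, 2
    weight = torch.randn(out_features, in_features)
    weight_scale = torch.rand(out_features, 1)   # one scale per output row ("per-row")

    # Each rank keeps a contiguous slice of output rows, plus the matching scale slice.
    weight_shards = weight.chunk(tp_world, dim=0)
    scale_shards = weight_scale.chunk(tp_world, dim=0)
    assert weight_shards[0].shape[0] == scale_shards[0].shape[0]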

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -24,7 +24,7 @@ dynamic = ["version"]
 dependencies = [
     "numpy>=1.26.4,<2.3.0",
     "accelerate>=0.20.3,!=0.34,<1.10",
-    "transformers>=4.45,<4.54",
+    "transformers>=4.45,<4.56",
     "torch>=2.2.0,<2.8",
     "tqdm>=4.66.2,<5.0",
     "datasets>=3.0.0,<5.0",
@@ -35,14 +35,14 @@ dependencies = [
 
 [project.optional-dependencies]
 examples = ["ninja>=1.11.1.1,<2.0", "evaluate", "huggingface_hub"]
-fp8 = ["llmcompressor", "torchao>=0.11,<=0.12"]
+fp8 = ["llmcompressor", "torchao==0.11"]
 gptq = ["Cython", "gptqmodel>=1.7.3"]
 mx = ["microxcaling>=1.1"]
 opt = ["fms-model-optimizer[fp8, gptq, mx]"]
 aiu = ["ibm-fms>=0.0.8"]
 torchvision = ["torchvision>=0.17"]
 flash-attn = ["flash-attn>=2.5.3,<3.0"]
-triton = ["triton>=3.0,<3.4"]
+triton = ["triton>=3.0,<3.5"]
 visualize = ["matplotlib", "graphviz", "pygraphviz", "tensorboard", "notebook"]
 dev = ["pre-commit>=3.0.4,<5.0"]
 test = ["pytest", "pillow"]
