Commit 6f289b0

Browse files
committed
Update FP8 bmm
Signed-off-by: Andrea Fasoli <[email protected]>
1 parent b2c0d54 commit 6f289b0

File tree: 4 files changed, +186 -86 lines changed

Lines changed: 32 additions & 71 deletions
@@ -14,7 +14,6 @@
 """FMS registration of attention BMM operation using torch-registered scaled BMM."""
 
 # Standard
-from importlib.util import find_spec
 from typing import NotRequired, Unpack
 import math

@@ -24,79 +23,44 @@
     _sdpa_update_attn_kwargs,
     register_attention_op,
 )
-from torch import Tensor
 import torch
 
 # Local
-import fms_mo.aiu_addons.fp8.fp8_aiu_op  # pylint: disable=unused-import
-
-if find_spec("torchao"):
-    TORCHAO_INSTALLED = True
-    # Third Party
-    from torchao.dtypes.affine_quantized_tensor import AffineQuantizedTensor
-    from torchao.dtypes.floatx.float8_layout import (
-        Float8AQTTensorImpl,
-        Float8Layout,
-        Float8MMConfig,
-    )
-    from torchao.quantization.granularity import PerTensor
-    from torchao.quantization.observer import get_block_size
-    from torchao.quantization.quant_primitives import ZeroPointDomain
-else:
-    TORCHAO_INSTALLED = False
+from fms_mo.aiu_addons.fp8.fp8_utils import ScaledTensor
+import fms_mo.aiu_addons.fp8.fp8_spyre_op  # pylint: disable=unused-import
 
 
 class MathFP8AttentionKwargs(AttentionKwargs):
     """TypedDict for FP8 attention."""
 
-    mask: NotRequired[Tensor]
+    mask: NotRequired[torch.Tensor]
     do_scale_q: bool
     is_causal_mask: bool
 
 
-# TODO: Doesn't quite work yet, more discussion needed
+# TODO: Figure out better scales for AIU? These come from vLLM
 Q_RANGE = 200.0
 K_RANGE = 200.0
 V_RANGE = 100.0
 
 
-def _construct_fp8_cache(
-    tensor: Tensor, scale: Tensor, orig_dtype: torch.dtype
-) -> AffineQuantizedTensor:
-    """Construct the torchao tensor to save kv cache with its scales."""
-
-    weight_granularity = PerTensor()
-    fp8_layout = Float8Layout(Float8MMConfig(use_fast_accum=True))
-    return AffineQuantizedTensor(
-        Float8AQTTensorImpl.from_plain(
-            tensor,
-            scale,
-            None,
-            fp8_layout,
-        ),
-        get_block_size(tensor.shape, weight_granularity),
-        tensor.shape,
-        zero_point_domain=ZeroPointDomain.NONE,
-        dtype=orig_dtype,
-    )
+def _construct_fp8_cache(tensor: torch.Tensor, scale: torch.Tensor) -> ScaledTensor:
+    """Construct the custom object to save KV cache with its scales."""
+    return ScaledTensor(tensor, scale)
 
 
 def _math_fp8_store_op(
-    keys: Tensor,  # pylint: disable=unused-argument
-    values: Tensor,
-    key_cache: Tensor | None,
-    value_cache: Tensor | None,
+    keys: torch.Tensor,  # pylint: disable=unused-argument
+    values: torch.Tensor,
+    key_cache: torch.Tensor | None,
+    value_cache: torch.Tensor | None,
     **attn_kwargs: Unpack[MathFP8AttentionKwargs],
-) -> tuple[Tensor, Tensor, Tensor, Tensor]:
+) -> tuple[ScaledTensor, ScaledTensor, ScaledTensor, ScaledTensor]:
     """Implement math of KV cache storing."""
 
-    orig_dtype = keys.dtype
-
-    if isinstance(key_cache, AffineQuantizedTensor) and isinstance(
-        value_cache, AffineQuantizedTensor
-    ):
-        k_scale = key_cache.tensor_impl.scale
-        v_scale = value_cache.tensor_impl.scale
+    if isinstance(key_cache, ScaledTensor) and isinstance(value_cache, ScaledTensor):
+        k_scale = key_cache._scale
+        v_scale = value_cache._scale
     else:
         k_scale = (torch.abs(keys).max() / K_RANGE).to(dtype=torch.float32)
         v_scale = (torch.abs(values).max() / V_RANGE).to(dtype=torch.float32)
@@ -105,36 +69,35 @@ def _math_fp8_store_op(
     values = (values / v_scale).to(torch.float8_e4m3fn).transpose(2, 1)
 
     if (
-        isinstance(key_cache, AffineQuantizedTensor)
-        and isinstance(value_cache, AffineQuantizedTensor)
+        isinstance(key_cache, ScaledTensor)
+        and isinstance(value_cache, ScaledTensor)
         and value_cache.numel() > 0
     ):
-        key_cache = torch.cat((key_cache.tensor_impl.float8_data, keys), dim=2)
-        value_cache = torch.cat((value_cache.tensor_impl.float8_data, values), dim=2)
-        key_cache = _construct_fp8_cache(key_cache, k_scale, orig_dtype)
-        value_cache = _construct_fp8_cache(value_cache, v_scale, orig_dtype)
+        key_cache = torch.cat((key_cache._data, keys), dim=2)
+        value_cache = torch.cat((value_cache._data, values), dim=2)
+        key_cache = _construct_fp8_cache(key_cache, k_scale)
+        value_cache = _construct_fp8_cache(value_cache, v_scale)
         return (
             key_cache,
             value_cache,
             key_cache,
             value_cache,
         )
-
-    keys = _construct_fp8_cache(keys, k_scale, orig_dtype)
-    values = _construct_fp8_cache(values, v_scale, orig_dtype)
+    keys = _construct_fp8_cache(keys.contiguous(), k_scale)
+    values = _construct_fp8_cache(values.contiguous(), v_scale)
     return (keys, values, keys, values)
 
 
 def _math_fp8_compute_op(
-    query: Tensor,
-    key_cache: Tensor,
-    value_cache: Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
     nheads: int,
     kvheads: int,
     p_dropout: float,
     scale_factor: float | None,
     **attn_kwargs: Unpack[MathFP8AttentionKwargs],
-) -> Tensor:
+) -> torch.Tensor:
     """Implement computation of attention BMM, leveraging the custom scaled attention
     BMM op that was pre-registered for torch.compile."""

@@ -147,13 +110,11 @@ def _math_fp8_compute_op(
 
     query = query.to(torch.float8_e4m3fn).transpose(2, 1)
 
-    if isinstance(key_cache, AffineQuantizedTensor) and isinstance(
-        value_cache, AffineQuantizedTensor
-    ):
-        k_scale = key_cache.tensor_impl.scale
-        v_scale = value_cache.tensor_impl.scale
-        key_cache = key_cache.tensor_impl.float8_data
-        value_cache = value_cache.tensor_impl.float8_data
+    if isinstance(key_cache, ScaledTensor) and isinstance(value_cache, ScaledTensor):
+        k_scale = key_cache._scale
+        v_scale = value_cache._scale
+        key_cache = key_cache._data
+        value_cache = value_cache._data
     else:
         k_scale = (torch.abs(key_cache).max() / K_RANGE).to(dtype=torch.float32)
         v_scale = (torch.abs(value_cache).max() / V_RANGE).to(dtype=torch.float32)
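
For context, the diff above replaces the torchao AffineQuantizedTensor cache wrapper with the new ScaledTensor container. The sketch below mirrors the first-store branch of _math_fp8_store_op (no pre-existing cache); the tensor shapes, the bfloat16 input dtype, and running it standalone are illustrative assumptions, not part of the commit.

import torch

from fms_mo.aiu_addons.fp8.fp8_utils import ScaledTensor

K_RANGE, V_RANGE = 200.0, 100.0  # fixed ranges from the module above

# assumed layout: [batch, seq_len, kv_heads, head_dim]
keys = torch.randn(1, 64, 8, 128, dtype=torch.bfloat16)
values = torch.randn(1, 64, 8, 128, dtype=torch.bfloat16)

# dynamic per-tensor scales, kept in fp32
k_scale = (keys.abs().max() / K_RANGE).to(torch.float32)
v_scale = (values.abs().max() / V_RANGE).to(torch.float32)

# quantize to FP8 (e4m3) and swap the sequence and kv-head dims, as the store op does
keys_fp8 = (keys / k_scale).to(torch.float8_e4m3fn).transpose(2, 1).contiguous()
values_fp8 = (values / v_scale).to(torch.float8_e4m3fn).transpose(2, 1).contiguous()

# each cache entry now carries the FP8 payload together with its scale
key_cache = ScaledTensor(keys_fp8, k_scale)
value_cache = ScaledTensor(values_fp8, v_scale)

# later steps concatenate new FP8 keys/values along dim=2 (the sequence dimension
# after the transpose) and re-wrap with the cached scales, e.g.:
#   key_cache = ScaledTensor(torch.cat((key_cache._data, new_keys_fp8), dim=2), k_scale)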

fms_mo/aiu_addons/fp8/fp8_linear.py

Whitespace-only changes.
Lines changed: 6 additions & 15 deletions
@@ -21,7 +21,7 @@
 # abstract op must be registered with specific I/O, even if not in use by the op function
 
 
-@torch.library.custom_op("sendnn::scaled_bmm", mutates_args=())
+@torch.library.custom_op("spyre::scaled_bmm", mutates_args=())
 def sendnn_scaled_bmm(
     mat1: Tensor,
     mat2: Tensor,
@@ -38,17 +38,8 @@ def sendnn_scaled_bmm(
     assert (
         mat1.shape[:-2] == mat2.shape[:-2]
     ), "batch dimensions must match for mat1 and mat2"
-    assert (
-        mat1.shape[:-2] == scale1.shape[:-2]
-    ), "batch dimensions must match for mat1 and scale1"
-    assert (
-        mat2.shape[:-2] == scale2.shape[:-2]
-    ), "batch dimensions must match for mat2 and scale2"
-
     mat1 = mat1.view(-1, *mat1.shape[-2:])
     mat2 = mat2.view(-1, *mat2.shape[-2:])
-    scale1 = scale1.view(-1, *scale1.shape[-2:])
-    scale2 = scale2.view(-1, *scale2.shape[-2:])
     out = torch.empty(
         (mat1.shape[0], mat1.shape[1], mat2.shape[2]),
         dtype=out_dtype,
@@ -58,12 +49,12 @@ def sendnn_scaled_bmm(
         out[b_idx] = torch._scaled_mm(
             mat1[b_idx],
             mat2[b_idx],
-            scale1[b_idx],
-            scale2[b_idx],
-            out_dtype,
-            use_fast_accum,
+            scale1,
+            scale2,
+            out_dtype=out_dtype,
+            use_fast_accum=use_fast_accum,
         )
-    return out
+    return out.view(*mat1.shape[:-2], mat1.shape[1], mat2.shape[2])
 
 
 @sendnn_scaled_bmm.register_fake
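
After this change the custom op takes one per-tensor scale per operand (scalar tensors) rather than per-batch scale tensors, and is registered under spyre::scaled_bmm. Below is a hypothetical eager call: the parameter names beyond mat1/mat2 (scale1, scale2, out_dtype, use_fast_accum) are inferred from the op body above, and running it eagerly needs a backend where torch._scaled_mm supports FP8 (for example CUDA SM 8.9+); on AIU/Spyre the op is intended to be captured by torch.compile instead.

import torch

# importing the module registers the op with torch.library
import fms_mo.aiu_addons.fp8.fp8_spyre_op  # noqa: F401

b, m, k, n = 2, 32, 64, 48  # dims kept as multiples of 16 for torch._scaled_mm

mat1 = torch.randn(b, m, k, device="cuda").to(torch.float8_e4m3fn)
# second operand is built column-major, matching torch._scaled_mm's layout rule
mat2 = torch.randn(b, n, k, device="cuda").to(torch.float8_e4m3fn).transpose(-2, -1)

scale1 = torch.tensor(1.0, device="cuda")  # per-tensor fp32 scales
scale2 = torch.tensor(1.0, device="cuda")

out = torch.ops.spyre.scaled_bmm(
    mat1, mat2, scale1=scale1, scale2=scale2,
    out_dtype=torch.bfloat16, use_fast_accum=True,
)
print(out.shape)  # (b, m, n)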

fms_mo/aiu_addons/fp8/fp8_utils.py

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FMS registration of attention BMM operation using torch-registered scaled BMM."""
+
+# Standard
+import functools
+
+# Third Party
+import torch
+
+# pylint: disable=unused-argument
+# unusued arguments are needed for templates
+
+
+_HANDLED_FUNCTIONS = {}
+
+
+def _implements(torch_function):
+    """Register a torch function override"""
+
+    def decorator(func):
+        @functools.wraps(torch_function)
+        def wrapper(f, types, args, kwargs):
+            return func(f, types, args, kwargs)
+
+        _HANDLED_FUNCTIONS[torch_function] = wrapper
+        return func
+
+    return decorator
+
+
+class ScaledTensor(torch.Tensor):
+    """Representation of a quantized tensor and its scale."""
+
+    def __new__(
+        cls,
+        data: torch.Tensor,
+        scale: torch.Tensor,
+    ):
+        return torch.Tensor._make_wrapper_subclass(
+            cls,
+            data.size(),
+            strides=data.stride(),
+            storage_offset=data.storage_offset(),
+            dtype=data.dtype,
+            layout=data.layout,
+            requires_grad=data.requires_grad,
+            device=data.device,
+        )
+
+    def __init__(
+        self,
+        data: torch.Tensor,
+        scale: torch.Tensor,
+    ):
+        self._data = data
+        self._scale = scale
+
+    def __tensor_flatten__(self):
+        ctx = {}
+        return ["_data", "_scale"], ctx
+
+    @staticmethod
+    def __tensor_unflatten__(inner_tensors, metadata, outer_size, outer_stride):
+        assert len(inner_tensors) == 2
+        return ScaledTensor(
+            inner_tensors["_data"],
+            inner_tensors["_scale"],
+        )
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args, kwargs=None):
+        if func in _HANDLED_FUNCTIONS:
+            return _HANDLED_FUNCTIONS[func](func, types, args, kwargs)
+
+        arg_types = tuple(type(arg) for arg in args)
+        kwarg_types = {k: type(arg) for k, arg in kwargs.items()}
+        raise NotImplementedError(
+            f"{cls.__name__} dispatch: attempting to run unimplemented "
+            f"operator/function: {func=}, {types=}, {arg_types=}, {kwarg_types=}"
+        )
+
+    def __repr__(self):
+        return f"{self._data.__repr__()}\n{self._scale.__repr__()}"
+
+
+def _infer_quantization_config(quant_config: dict) -> dict | None:
+    # There's many quantization packages compatible with HF
+    # We initially focus on llm-compressor as it is the one used in FMS-MO
+
+    # llm-compressor saves its checkpoints with quant_method = compressed-tensors
+    # quantization_status tells us whether the model has already been quantized
+    # We only support loading already quantized models (compressed status)
+    if (
+        quant_config["quant_method"] == "compressed-tensors"
+        and quant_config["quantization_status"] == "compressed"
+    ):
+        # FP8 quantization will have FP8 weights
+        # We assume a single quantization group (group_0), to follow fms-mo checkpoints
+        # num_bits and type tells us "float" with "8" bits, aka FP8
+        if (
+            quant_config["config_groups"]["group_0"]["weights"]["type"] == "float"
+            and quant_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8
+        ):
+            # This is used by get_linear to decide whether a linear layer
+            # will be quantized or not inside the model
+            def fp8_linear_type(name: str) -> str:
+                # We need to translate HF names to FMS names
+                translations = {
+                    "lm_head": "head",
+                }
+                for ignored_layer in quant_config["ignore"]:
+                    assert isinstance(ignored_layer, str)
+                    fms_ign_layer = translations.get(ignored_layer, ignored_layer)
+                    if name in fms_ign_layer:
+                        return "torch_linear"
+                for pattern in quant_config["config_groups"]["group_0"]["targets"]:
+                    # Special case from llm-compressor that covers all linear layers
+                    # not in the ignore pattern
+                    assert isinstance(pattern, str)
+                    if pattern == "Linear":
+                        return "fp8"
+                    if name in translations.get(pattern, pattern):
+                        return "fp8"
+                return "torch_linear"
+
+            return {
+                "linear_type": fp8_linear_type,
+                "input_activations": quant_config["config_groups"]["group_0"][
+                    "input_activations"
+                ],
+                "output_activations": quant_config["config_groups"]["group_0"][
+                    "output_activations"
+                ],
+                "weights": quant_config["config_groups"]["group_0"]["weights"],
+            }
+    return None
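
ScaledTensor is a wrapper subclass whose __torch_dispatch__ only runs ops that were explicitly registered through _implements. This commit registers no handlers; the detach override below is purely an illustrative assumption showing how one could be hooked in.

import torch

from fms_mo.aiu_addons.fp8.fp8_utils import ScaledTensor, _implements


@_implements(torch.ops.aten.detach.default)
def _detach_handler(func, types, args, kwargs):
    # unwrap, detach the FP8 payload, and re-wrap it with the same scale
    tensor = args[0]
    return ScaledTensor(tensor._data.detach(), tensor._scale)


st = ScaledTensor(torch.randn(4, 4).to(torch.float8_e4m3fn), torch.tensor(0.05))
print(st.detach()._scale)  # routed through the handler registered above
# any op without a registered handler raises NotImplementedError from __torch_dispatch__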
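
_infer_quantization_config only recognizes llm-compressor style checkpoints. The sample below is a hypothetical quantization_config shaped after the fields the function reads (quant_method, quantization_status, config_groups.group_0, ignore); the concrete values are illustrative, not taken from a real model card.

from fms_mo.aiu_addons.fp8.fp8_utils import _infer_quantization_config

quant_config = {
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed",
    "ignore": ["lm_head"],
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {"type": "float", "num_bits": 8},
            "input_activations": {"type": "float", "num_bits": 8},
            "output_activations": None,
        }
    },
}

fms_config = _infer_quantization_config(quant_config)
linear_type = fms_config["linear_type"]
print(linear_type("head"))   # "torch_linear": lm_head is ignored (HF name mapped to "head")
print(linear_type("query"))  # "fp8": the "Linear" target covers all remaining linear layers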

0 commit comments