
Commit 13ebd90

rot impl
Signed-off-by: cliu-us <[email protected]>
1 parent 4705c75 commit 13ebd90

4 files changed, 275 insertions(+), 0 deletions(-)


fms_mo/quant/quantizers.py

Lines changed: 25 additions & 0 deletions
@@ -40,6 +40,9 @@
 import torch.nn as nn  # pylint: disable=consider-using-from-import
 import torch.nn.functional as F

+# Local
+from fms_mo.quant.rotation import RotQuantWrapper
+
 logger = logging.getLogger(__name__)


@@ -66,8 +69,16 @@ def get_activation_quantizer(
     - pact/pact+/pactsym
     - sawb/sawb+
     - max
+
+    If qa_mode has a "rot_" prefix or "_rot" suffix, wrap the quantizer with RotQuantWrapper();
+    remember to set up the R_left, R_right tensors later.
     """

+    use_rot = False
+    if "rot_" in qa_mode or "_rot" in qa_mode:
+        use_rot = True
+        qa_mode = qa_mode.replace("rot_", "").replace("_rot", "")
+
     if not use_swcap:
         QPACTLUT = {
             "pact_uni": PACT,
@@ -220,6 +231,9 @@ def get_activation_quantizer(
             f"activation quantization mode {qa_mode} is incompatible with swcap"
         )

+    if use_rot:
+        act_quantizer = RotQuantWrapper(act_quantizer)
+
     return act_quantizer


@@ -245,7 +259,15 @@ def get_weight_quantizer(
     SWCAP quantizers:
     - sawb/sawb+
     - max
+    If qw_mode has a "rot_" prefix or "_rot" suffix, wrap the quantizer with RotQuantWrapper();
+    remember to set up the R_left, R_right tensors later.
     """
+
+    use_rot = False
+    if "rot_" in qw_mode or "_rot" in qw_mode:
+        use_rot = True
+        qw_mode = qw_mode.replace("rot_", "").replace("_rot", "")
+
     weight_quantizer = None
     if not use_swcap:
         cggrad = "cgpact" in qw_mode
@@ -367,6 +389,9 @@ def get_weight_quantizer(
             f"activation quantized mode {qw_mode} is incompatible with swcap"
        )

+    if use_rot:
+        weight_quantizer = RotQuantWrapper(weight_quantizer)
+
     return weight_quantizer


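A brief illustration of the new mode-string handling above. The helper below is a standalone, hypothetical mirror of the added prefix/suffix logic (it is not part of this commit) so the behavior can be checked without constructing a real quantizer:

    def split_rot_mode(mode: str):
        # Mirrors the added handling: detect "rot_"/"_rot" and strip it from the mode string.
        use_rot = "rot_" in mode or "_rot" in mode
        bare_mode = mode.replace("rot_", "").replace("_rot", "")
        return use_rot, bare_mode

    print(split_rot_mode("rot_pactsym"))  # (True, 'pactsym')
    print(split_rot_mode("sawb+_rot"))    # (True, 'sawb+')
    print(split_rot_mode("max"))          # (False, 'max')

When use_rot is True, the factory builds the quantizer for the bare mode and then wraps it in RotQuantWrapper; the caller still has to attach R_left / R_right afterwards, as the docstrings note.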
fms_mo/quant/rotation.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Util functions related to Hadamard rotation."""
+
+# Third Party
+import torch
+
+# Local
+from fms_mo.utils.hadamard_util import matmul_hadU_cuda
+
+
+class RotQuantWrapper(torch.nn.Module):
+    """Wrapper around fms-mo quantizers. Objects of this class can hold two rotation tensors,
+    and the basic formula is:
+
+        self.quantizer(self.R_left @ input_tensor @ self.R_right)
+
+    NOTE: R_left/R_right are optional, depending on whether the wrapper is used for weights or
+    activations. For example, in SpinQuant the QKV Linears look like (pseudo-code, "self" does
+    not refer to the same object on each line):
+
+        qx = self.quantize_feature(x)  # no rotation, just a normal quantizer
+        qw_q = self.quantize_weight(self.weight, R1_t)      # needs left rotation only
+        qw_k = self.quantize_weight(self.weight, R1_t)
+        qw_v = self.quantize_weight(self.weight, R1_t, R2)  # needs both left and right rotation
+
+        return F.linear(qx, qw, bias)
+
+    For the MLP down_proj:
+
+        qx = self.quantize_feature(x, None, R4)  # for activations it should be x @ R
+        qw = self.quantize_weight(self.weight, R4_t, R1)
+
+        return F.linear(qx, qw, bias)
+
+    Also make sure self.R_left / self.R_right point to nn.Parameter() objects if the rotations
+    need to be trained.
+    """
+
+    def __init__(self, quantizer, *args, **kwargs):
+        self.online_full_had = kwargs.pop("online_full_had", None)
+        self.fp32_had = kwargs.pop("fp32_had", None)
+        super().__init__(*args, **kwargs)
+        self.quantizer = quantizer
+        self.R_left = None
+        self.R_right = None
+        self.K_left = None  # if K_xxx > 1, R_xxx is a special Hadamard matrix
+        self.K_right = None
+
+    def forward(self, input_tensor):
+        org_dtype = input_tensor.dtype
+
+        if self.online_full_had:
+            # Online Hadamard => rotation for activations, i.e. input_tensor @ R_right.
+            # It cannot be fused into W and is not trainable, either.
+            if self.fp32_had:
+                input_tensor = input_tensor.float()
+            input_tensor = matmul_hadU_cuda(
+                input_tensor, self.R_right, self.K_right
+            ).to(org_dtype)
+        else:
+            # Not online => rotation for weights, could be fused into W later.
+            if self.R_left is not None:
+                input_tensor = self.R_left @ input_tensor
+            if self.R_right is not None:
+                input_tensor = input_tensor @ self.R_right
+
+        # Apply the wrapped quantizer after rotation, per the formula in the class docstring.
+        if self.quantizer is not None:
+            input_tensor = self.quantizer(input_tensor)
+
+        return input_tensor

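A minimal sketch of how the wrapper is meant to be wired up, following the class docstring. The dummy quantizer, the hidden size, and the QR-based way of producing an orthogonal "R1" below are illustrative assumptions, not fms-mo APIs:

    import torch
    from fms_mo.quant.rotation import RotQuantWrapper

    class DummyQuantizer(torch.nn.Module):
        # Stand-in for a real fms-mo quantizer; just passes the tensor through.
        def forward(self, x):
            return x

    hidden = 512
    R1 = torch.linalg.qr(torch.randn(hidden, hidden))[0]  # a random orthogonal matrix as "R1"

    w_quant = RotQuantWrapper(DummyQuantizer())
    w_quant.R_left = R1.T                            # e.g. a Q/K projection weight takes R1_t on the left
    q_weight = w_quant(torch.randn(hidden, hidden))  # R1_t @ W, then the wrapped quantizer

For activations with an online Hadamard (e.g. R4 before down_proj), online_full_had would be set instead and R_right / K_right would come from get_hadK, since that path runs matmul_hadU_cuda rather than a plain matmul.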
fms_mo/utils/hadamard_util.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This code is based on QuaRot (https://github.com/spcl/QuaRot/tree/main/quarot).
+# Licensed under Apache License 2.0.
+# Adapted from https://github.com/Cornell-RelaxML/quip-sharp/blob/main/lib/utils/matmul_had.py
+# and https://github.com/facebookresearch/SpinQuant/blob/main/utils/hadamard_utils.py
+"""
+Changed the original "text tensor" implementation into binary safetensors for better efficiency.
+Only 12 special sizes are available in the safetensors file:
+[12, 20, 28, 36, 40, 44, 52, 60, 108, 140, 156, 172]
+"""
+
+# Third Party
+from fast_hadamard_transform import hadamard_transform
+from safetensors import safe_open
+import torch
+
+
+class HadamardTransform(torch.autograd.Function):
+    """The unnormalized Hadamard transform (i.e. without dividing by sqrt(2))."""
+
+    # TODO seems redundant; inside hadamard_transform(), backward is already handled...?
+    @staticmethod
+    def forward(ctx, u):
+        return hadamard_transform(u)
+
+    @staticmethod
+    def backward(ctx, grad):
+        return hadamard_transform(grad)
+
+
+def get_hadK(n, transpose=False):
+    """Simplified implementation that uses binary tensors instead of the text implementation."""
+    hadK = None
+    for K in [172, 156, 140, 108, 60, 52, 44, 40, 36, 28, 20, 12]:
+        if n % K == 0 and is_pow2(n // K):
+            with safe_open("hadk.safetensors", framework="pt") as f:
+                assert (
+                    str(K) in f.keys()
+                ), f"Special size Hadamard {K} does not exist in the file."
+                hadK = f.get_tensor(str(K))
+
+            if transpose:
+                hadK = hadK.T
+
+            break
+
+    if hadK is None:
+        if is_pow2(n):
+            K = 1
+        else:
+            raise RuntimeError(
+                f"{n} is not a power of 2 and does not have a special size Hadamard available."
+            )
+
+    return hadK, K
+
+
+def matmul_hadU(X, transpose=False):
+    n = X.shape[-1]
+    hadK, K = get_hadK(n, transpose)
+    input = X.clone().view(-1, n, 1)
+    output = input.clone()
+    while input.shape[1] > K:
+        input = input.view(input.shape[0], input.shape[1] // 2, 2, input.shape[2])
+        output = output.view(input.shape)
+        output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
+        output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
+        output = output.view(input.shape[0], input.shape[1], -1)
+        (input, output) = (output, input)
+    del output
+
+    if K > 1:
+        # Do not explicitly repeat - OOM
+        # input = torch.bmm(
+        #     hadK.repeat(len(input), 1, 1).to(input.device).to(input.dtype), input)
+        # Use broadcasting instead
+        input = hadK.view(1, K, K).to(input) @ input
+
+    return input.view(X.shape) / torch.tensor(n).sqrt()
+
+
+def matmul_hadUt(X):
+    return matmul_hadU(X, transpose=True)
+
+
+def random_hadamard_matrix(size, device):
+    # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation"
+    Q = torch.randint(low=0, high=2, size=(size,)).to(torch.float64)
+    Q = Q * 2 - 1
+    Q = torch.diag(Q)
+    return matmul_hadU(Q).to(device)
+
+
+def hadamard_matrix(size, device):
+    # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation"
+    Q = torch.eye(size)
+    return matmul_hadU(Q).to(device)
+
+
+def matmul_hadU_cuda(X, hadK, K, transpose=False):
+    n = X.shape[-1]
+    if K == 1:
+        return HadamardTransform.apply(X.contiguous()) / torch.tensor(n).sqrt()
+
+    if transpose:
+        hadK = hadK.T.contiguous()
+    input = X.view(-1, K, n // K)
+    input = HadamardTransform.apply(input.contiguous()) / torch.tensor(n).sqrt()
+    input = hadK.to(input.device).to(input.dtype) @ input
+    return input.reshape(X.shape)
+
+
+def matmul_hadUt_cuda(X, hadK, K):
+    return matmul_hadU_cuda(X, hadK, K, transpose=True)
+
+
+def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None):
+    assert isinstance(module, torch.nn.Linear)
+    in_features, out_features = module.in_features, module.out_features
+
+    if had_dim != -1:
+        assert is_pow2(had_dim), "Hadamard dimension must be a power of 2!"
+
+    W_ = module.weight.data
+    dtype = W_.dtype
+    dev = W_.device
+    init_shape = W_.shape
+    W_ = W_.float().cuda()
+
+    if had_dim == -1:
+        if output:
+            had_K, K = get_hadK(out_features)
+            W_ = matmul_hadU_cuda(W_.t(), had_K, K).t()
+        if not output:
+            had_K, K = get_hadK(in_features)
+            W_ = matmul_hadU_cuda(W_, had_K, K)
+    else:
+        hadK = hadamard_matrix(had_dim, "cuda").to(torch.float64)
+        if R2 is not None:
+            hadK = R2.to(torch.float64)
+        if output:
+            W_ = W_.t()
+            transposed_shape = W_.shape
+            temp = W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim)
+            temp = temp.to(torch.float64) @ hadK
+            W_ = temp.reshape(transposed_shape).t()
+        else:
+            init_shape = W_.shape
+            temp = W_.reshape(-1, init_shape[-1] // had_dim, had_dim)
+            temp = temp.to(torch.float64) @ hadK
+            W_ = temp.reshape(init_shape)
+    module.weight.data = W_.to(device=dev, dtype=dtype)
+
+
+def is_pow2(n):
+    return (n & (n - 1) == 0) and (n > 0)
+
+
+# Hadamard matrices for had12, had36.pal2, had52.will,
+# had60.pal, had108.pal, had140.pal, had156.will, had172.will:
+# http://www.neilsloane.com/hadamard/index.html

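A quick sanity-check sketch for the pure-PyTorch path added above. Power-of-2 sizes never touch hadk.safetensors, though importing the module still requires fast_hadamard_transform and safetensors to be installed; the size 512 here is an arbitrary choice:

    import torch
    from fms_mo.utils.hadamard_util import matmul_hadU, random_hadamard_matrix

    x = torch.randn(4, 512, dtype=torch.float64)
    y = matmul_hadU(x)  # normalized Hadamard transform along the last dim
    print(torch.allclose(x.norm(dim=-1), y.norm(dim=-1)))  # orthogonal transform preserves norms

    R = random_hadamard_matrix(512, "cpu")  # randomized Hadamard matrix, R @ R.T ≈ I
    print(torch.allclose(R @ R.T, torch.eye(512, dtype=torch.float64)))

matmul_hadU_cuda and apply_exact_had_to_linear additionally need a CUDA device, since they go through the fast_hadamard_transform kernel and an explicit .cuda() call, respectively.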
fms_mo/utils/hadk.safetensors

382 KB
Binary file not shown.
