# Federico Brancasi <[email protected]>

import math
+from typing import Optional, Tuple

import torch
import torch.nn.functional as F
from brevitas.nn.quant_mha import QuantMultiheadAttention
from torch import Tensor


-def mhaForward(
-    self: QuantMultiheadAttention, query: Tensor, key: Tensor, value: Tensor
+def _mhaForwardImpl(
+    self: QuantMultiheadAttention,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    need_transpose_in: bool,
+    need_transpose_out: bool,
) -> Tensor:
-    """Explicit, export-friendly MHA forward."""
-    qOut = self.q_proj(query)
-    kOut = self.k_proj(key)
-    vOut = self.v_proj(value)
+    """Core MHA forward implementation."""
+    # FBRANCASI: Handle batch_first by transposing if needed
+    if need_transpose_in:
+        if key is value:
+            if query is key:
+                query = key = value = query.transpose(1, 0)
+            else:
+                query, key = [x.transpose(1, 0) for x in (query, key)]
+                value = key
+        else:
+            query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
+
+    if self.in_proj is not None:
+        # FBRANCASI: Handle packed projections (default case for models like ViT)
+        # Only self-attention where query is key is value is supported
+        if not (query is key and key is value):
+            raise RuntimeError(
+                "Packed in_proj is supported only for self-attention where query is key is value. Set packed_in_proj=False."
+            )
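+        # The packed projection yields (seqLen, batchSize, 3 * embedDim); unwrap a
+        # possible Brevitas QuantTensor via .value, then split into q, k, v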
+        qkv = self.in_proj(query)
+        qkv_tensor = qkv.value if hasattr(qkv, "value") else qkv
+        qOut, kOut, vOut = qkv_tensor.chunk(3, dim=-1)
+    else:
+        q_result = self.q_proj(query)
+        k_result = self.k_proj(key)
+        v_result = self.v_proj(value)
+
+        qOut = q_result.value if hasattr(q_result, "value") else q_result
+        kOut = k_result.value if hasattr(k_result, "value") else k_result
+        vOut = v_result.value if hasattr(v_result, "value") else v_result

    seqLen, batchSize, embedDim = qOut.shape
    headDim = embedDim // self.num_heads

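+    # Split heads for batched matmul:
+    # (seqLen, batchSize, embedDim) -> (batchSize * num_heads, seqLen, headDim)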
    qOut = (
-        qOut.view(seqLen, batchSize, self.num_heads, headDim)
-        .permute(1, 2, 0, 3)
-        .reshape(batchSize * self.num_heads, seqLen, headDim)
+        qOut.contiguous()
+        .view(seqLen, batchSize * self.num_heads, headDim)
+        .transpose(0, 1)
    )
    kOut = (
-        kOut.view(seqLen, batchSize, self.num_heads, headDim)
-        .permute(1, 2, 0, 3)
-        .reshape(batchSize * self.num_heads, seqLen, headDim)
+        kOut.contiguous()
+        .view(seqLen, batchSize * self.num_heads, headDim)
+        .transpose(0, 1)
    )
    vOut = (
-        vOut.view(seqLen, batchSize, self.num_heads, headDim)
-        .permute(1, 2, 0, 3)
-        .reshape(batchSize * self.num_heads, seqLen, headDim)
+        vOut.contiguous()
+        .view(seqLen, batchSize * self.num_heads, headDim)
+        .transpose(0, 1)
    )

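+    # Scale queries by 1 / sqrt(headDim) before computing attention scores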
    qScaled = qOut / math.sqrt(headDim)
@@ -54,10 +86,65 @@ def mhaForward(
    attnOutput = torch.bmm(attnWeights, vOut)

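+    # Merge heads back: (batchSize * num_heads, seqLen, headDim) -> (seqLen, batchSize, embedDim)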
    attnOutput = (
-        attnOutput.view(batchSize, self.num_heads, seqLen, headDim)
-        .permute(2, 0, 1, 3)
-        .reshape(seqLen, batchSize, embedDim)
+        attnOutput.transpose(0, 1).contiguous().view(seqLen, batchSize, embedDim)
    )

-    attnOutput = self.out_proj(attnOutput)
+    out_result = self.out_proj(attnOutput)
+    attnOutput = out_result.value if hasattr(out_result, "value") else out_result
+
+    if need_transpose_out:
+        attnOutput = attnOutput.transpose(1, 0)
+
    return attnOutput
+
+
+def mhaForwardBatchFirst(
+    self: QuantMultiheadAttention,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    need_weights: bool = True,
+    **kwargs,
+) -> Tuple[Tensor, Optional[Tensor]]:
+    """MHA forward for batch_first=True."""
+    attn_output = _mhaForwardImpl(
+        self, query, key, value, need_transpose_in=True, need_transpose_out=True
+    )
+    # PyTorch always returns a tuple, even when need_weights=False
+    return (attn_output, None)
+
+
+def mhaForwardSeqFirst(
+    self: QuantMultiheadAttention,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    need_weights: bool = True,
+    **kwargs,
+) -> Tuple[Tensor, Optional[Tensor]]:
+    """MHA forward for batch_first=False."""
+    attn_output = _mhaForwardImpl(
+        self, query, key, value, need_transpose_in=False, need_transpose_out=False
+    )
+    # PyTorch always returns a tuple, even when need_weights=False
+    return (attn_output, None)
+
+
+def mhaForward(
+    self: QuantMultiheadAttention,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    need_weights: bool = True,
+    **kwargs,
+) -> Tuple[Tensor, Optional[Tensor]]:
+    """Explicit, export-friendly MHA forward.
+
+    This function will be replaced with the appropriate batch_first or seq_first
+    version during module transformation, based on the module's batch_first attribute.
+    """
+    # FBRANCASI: Dispatch to the appropriate version before tracing
+    if self.batch_first:
+        return mhaForwardBatchFirst(self, query, key, value, need_weights, **kwargs)
+    else:
+        return mhaForwardSeqFirst(self, query, key, value, need_weights, **kwargs)
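
Example usage (an illustrative sketch, not part of this commit; patchQuantMha and model are assumed names): bind the resolved forward to each QuantMultiheadAttention before tracing, so the batch_first decision is made ahead of export.

import types

def patchQuantMha(model):
    # Pick the forward matching each module's batch_first setting and bind it
    # as that instance's forward, so no data-dependent branching is traced.
    for module in model.modules():
        if isinstance(module, QuantMultiheadAttention):
            fwd = mhaForwardBatchFirst if module.batch_first else mhaForwardSeqFirst
            module.forward = types.MethodType(fwd, module)
    return model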