
Commit b8b2ecb

enable_qnn_masked_softmax
Differential Revision: D81248699
Pull Request resolved: pytorch#13788
1 parent dc57e56 commit b8b2ecb

4 files changed (+21, -5 lines)


examples/models/llama/attention.py

Lines changed: 1 addition & 0 deletions
@@ -331,6 +331,7 @@ def __init__(
         args: ModelArgs,
         layer_id: int,
         rope: Rope,
+        **_kwargs: Any,
     ):
         """
         Multi-head attention layer.
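
Why this otherwise unused catch-all: once the registry call sites splat args.attention_kwargs (see llama_transformer.py below), every class reachable through ATTENTION_REGISTRY has to tolerate keyword options it does not consume. A minimal sketch, reusing the names already imported in attention.py (the class itself is hypothetical, not part of this commit):

# Illustrative only: an attention variant that ignores the new options still
# accepts them, so cls(args, layer_id, rope, **args.attention_kwargs) succeeds
# for every registered attention type.
class PlainAttention(Attention):  # hypothetical registry entry
    def __init__(self, args: ModelArgs, layer_id: int, rope: Rope, **_kwargs: Any):
        super().__init__()
        # _kwargs (e.g. enable_qnn_masked_softmax) is deliberately ignored here.

    # forward() omitted for brevity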

examples/models/llama/llama_transformer.py

Lines changed: 2 additions & 2 deletions
@@ -121,7 +121,7 @@ def from_type(cls, layer_id, args, rope) -> "TransformerBlock":
                 f"Available: {list(ATTENTION_REGISTRY.keys())}"
             )
         cls = ATTENTION_REGISTRY[args.attention_type]
-        attention = cls(args, layer_id, rope)
+        attention = cls(args, layer_id, rope, **args.attention_kwargs)
         return TransformerBlock(args, attention)

     def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions):  # x: 1xN
@@ -255,7 +255,7 @@ def construct_transformer(model_args: ModelArgs) -> Transformer:
     layers = torch.nn.ModuleList()
     cls = ATTENTION_REGISTRY[model_args.attention_type]
     for layer_id in range(model_args.n_layers):
-        attention = cls(model_args, layer_id, rope)
+        attention = cls(model_args, layer_id, rope, **model_args.attention_kwargs)
         transformer_block = TransformerBlock(model_args, attention)
         layers.append(transformer_block)

examples/models/llama/model_args.py

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,6 @@
+import dataclasses
 from dataclasses import dataclass
-from typing import Dict, Optional
+from typing import Any, Dict, Optional


 @dataclass
@@ -69,6 +70,7 @@ class ModelArgs:
     kv_io_bit_width: Optional[int] = (
         None  # KV cache bit width. This is for QNN backend only for now.
     )
+    attention_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)

     def __post_init__(self):
         if self.n_kv_heads is None:
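
A minimal usage sketch of the new field (every other value below is a placeholder, not taken from this commit): anything placed in attention_kwargs is expanded into keyword arguments of the attention constructor when the transformer is built, and the default_factory=dict default means configs that never set it splat to nothing, leaving existing behavior unchanged.

# Hypothetical config; the registry key "static" is an assumption, not verified here.
model_args = ModelArgs(
    attention_type="static",
    attention_kwargs={"enable_qnn_masked_softmax": True},
)
# construct_transformer() in llama_transformer.py now builds each layer via
#   cls(model_args, layer_id, rope, enable_qnn_masked_softmax=True)
transformer = construct_transformer(model_args)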

examples/models/llama/static_attention.py

Lines changed: 15 additions & 2 deletions
@@ -658,7 +658,12 @@ class StaticAttention(Attention):
     """

     def __init__(
-        self, config: ModelArgs, layer_id: int, rope: Rope, split_mha: bool = True
+        self,
+        config: ModelArgs,
+        layer_id: int,
+        rope: Rope,
+        split_mha: bool = True,
+        **kwargs: Any,
     ):
         super().__init__()
         self.n_heads = config.n_heads
@@ -676,6 +681,7 @@ def __init__(
         self.qk_norm_before_rope = config.qk_norm_before_rope
         self.split_mha = split_mha
         self.use_conv2d = False
+        self.enable_qnn_masked_softmax = kwargs.get("enable_qnn_masked_softmax", False)

         if self.split_mha:
             self.wqs = nn.ModuleList(
@@ -857,7 +863,14 @@ def _forward_sha(
             kv_idx = i // self.n_heads_per_kv_group
             attn = new_qs[i] @ all_ks[kv_idx].transpose(-2, -1)
             attn = attn * self.inv_scale
-            attn = attn + mask
+            if self.enable_qnn_masked_softmax:
+                attn_min = torch.amin(attn, dim=-1, keepdim=True)
+                minus_value = -20
+                attn = torch.where(
+                    mask == 0, attn, attn_min + minus_value
+                )  # pyre-ignore
+            else:
+                attn = attn + mask
             attn = F.softmax(attn, dim=-1)
             heads.append(attn @ all_vs[kv_idx])
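
For intuition, a standalone sketch (not part of the commit) comparing the original additive mask with the bounded rewrite. Masked scores are pushed to about 20 below the row minimum rather than to -inf, presumably to keep the pre-softmax values in a range that quantizes well on the QNN backend; their softmax weight ends up around e^-20 of any kept score, so the two paths agree to well within float32 precision.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
attn = torch.randn(1, 1, 8)  # raw attention scores for one query position
neg_inf = float("-inf")
mask = torch.tensor([[[0.0, 0.0, 0.0, 0.0, 0.0, neg_inf, neg_inf, neg_inf]]])

# Baseline: additive mask, masked logits go to -inf.
ref = F.softmax(attn + mask, dim=-1)

# QNN-friendly variant: clamp masked logits to (row min - 20) instead of -inf.
attn_min = torch.amin(attn, dim=-1, keepdim=True)
qnn = F.softmax(torch.where(mask == 0, attn, attn_min - 20), dim=-1)

print((ref - qnn).abs().max())  # tiny (~1e-7 or smaller in float32)

The offset of 20 mirrors minus_value in the diff: it is large enough that masked positions contribute roughly 2e-9 of the attention weight, yet small enough that the masked logits stay within the same numeric range as the real scores.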
