Commit 30b31ac

optimize scale_factor and feedforward, and change tokens to int32 to improve the embedding op

1 parent 37c471d commit 30b31ac

7 files changed: +62 -14 lines

backends/qualcomm/_passes/i64_to_i32.py (1 addition, 6 deletions)

@@ -61,12 +61,7 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule):
                 to_dst_node.meta["val"] = node_val.to(torch.int32)

                 # Replace usage of the src dtype result with the dst dtype result.
-                if n.name != "tokens":
-                    n.replace_all_uses_with(to_dst_node)
-                else:
-                    for user in n.users.copy():
-                        if user.name != "quantized_decomposed_embedding_4bit_dtype":
-                            user.replace_input_with(n, to_dst_node)
+                n.replace_all_uses_with(to_dst_node)
                 to_dst_node.args = (n,)

     def call(self, graph_module: torch.fx.GraphModule):
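The deleted branch special-cased the "tokens" placeholder so the int64 input could still feed quantized_decomposed_embedding_4bit_dtype directly. With tokens now created as int32 at the source (see examples/models/llama/model.py below), every user can be rewired uniformly. A minimal sketch of the cast-and-rewire pattern on a torch.fx graph; the function name and the placeholder filter are illustrative, not taken from this commit:

import torch
import torch.fx as fx

def cast_i64_placeholders_to_i32(gm: fx.GraphModule) -> fx.GraphModule:
    for n in gm.graph.nodes:
        if n.op != "placeholder":
            continue
        val = n.meta.get("val")
        if val is None or val.dtype != torch.int64:
            continue
        # Insert an int32 cast right after the int64 placeholder.
        with gm.graph.inserting_after(n):
            to_i32 = gm.graph.call_function(torch.ops.aten.to.dtype, (n, torch.int32))
        to_i32.meta["val"] = val.to(torch.int32)
        # Rewire every consumer to the int32 result. This also rewrites the cast's
        # own input, so restore it afterwards (the diff's `to_dst_node.args = (n,)`
        # serves the same purpose).
        n.replace_all_uses_with(to_i32)
        to_i32.args = (n, torch.int32)
    gm.recompile()
    return gm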

backends/qualcomm/_passes/layout_transform.py (1 addition, 0 deletions)

@@ -62,6 +62,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.prelu.default,
         exir_ops.edge.aten.relu.default,
         exir_ops.edge.aten._softmax.default,  # TODO: Need to find a new solution to do "axis_order" to transform axis.
+        exir_ops.edge.aten.sigmoid.default,
         exir_ops.edge.aten.sqrt.default,
         exir_ops.edge.aten.sub.Tensor,
         exir_ops.edge.aten.sum.dim_IntList,
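Sigmoid is elementwise, so it commutes with the axis-order permutes this pass manages, which is why it can join this layout-agnostic list. A quick sanity check of that property (illustrative, not from the commit):

import torch

x = torch.randn(2, 3, 4, 5)           # NCHW
nhwc = x.permute(0, 2, 3, 1)          # same data, NHWC axis order
assert torch.allclose(torch.sigmoid(x).permute(0, 2, 3, 1), torch.sigmoid(nhwc))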

backends/qualcomm/utils/utils.py (2 additions, 2 deletions)

@@ -166,8 +166,8 @@ def __init__(self, weight, bias=None):
         super().__init__()
         use_bias = bias is not None
         self.conv = torch.nn.Conv2d(
-            in_channels=weight.shape[0],
-            out_channels=weight.shape[1],
+            in_channels=weight.shape[1],
+            out_channels=weight.shape[0],
             kernel_size=1,
             padding=0,
             bias=use_bias,
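This fixes a swapped channel mapping: torch.nn.Linear.weight has shape (out_features, in_features), so a 1x1 Conv2d built from it must take in_channels from weight.shape[1] and out_channels from weight.shape[0]. The old code only worked when the two dimensions happened to be equal. A small equivalence check of the Linear-as-1x1-conv trick (toy sizes, my own sketch):

import torch

lin = torch.nn.Linear(8, 16, bias=False)   # weight: (16, 8) = (out, in)
conv = torch.nn.Conv2d(in_channels=8, out_channels=16, kernel_size=1, bias=False)
conv.weight = torch.nn.Parameter(lin.weight.reshape(16, 8, 1, 1))

x = torch.randn(4, 8)                      # (batch, in_features)
y_lin = lin(x)
y_conv = conv(x.reshape(4, 8, 1, 1)).reshape(4, 16)
assert torch.allclose(y_lin, y_conv, atol=1e-6)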

examples/models/llama/export_llama_lib.py (3 additions, 0 deletions)

@@ -66,6 +66,7 @@
     replace_causal_mask,
     replace_kv_cache_with_coreml_kv_cache,
     replace_kv_cache_with_simple_kv_cache,
+    replace_feedforward_to_conv2d,
     replace_sdpa_with_coreml_sdpa,
     replace_sdpa_with_custom_op,
     replace_sdpa_with_flex_sdpa,

@@ -961,6 +962,7 @@ def _get_source_transforms(  # noqa
             transforms.append(replace_attention_to_attention_sha)
             transforms.append(replace_causal_mask)
             transforms.append(replace_rms_norm_with_native_rms_norm)
+            transforms.append(replace_feedforward_to_conv2d)
             transforms.append(convert_linear_to_conv2d)
         else:
             transforms.append(replace_kv_cache_with_simple_kv_cache)

@@ -972,6 +974,7 @@ def _get_source_transforms(  # noqa
                 transforms.append(
                     get_model_with_r1_r2(args.optimized_rotation_path)
                 )
+            transforms.append(replace_feedforward_to_conv2d)
             transforms.append(convert_linear_to_conv2d)

     elif args.mps:
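In both branches replace_feedforward_to_conv2d is appended before convert_linear_to_conv2d, and the order matters: once the Linears inside a FeedForward have been rewritten as Conv2d wrappers, the feedforward fusion can no longer read their (out_features, in_features) weights. A sketch of how the returned transforms compose, assuming (as the list-building code here suggests) they are applied in order; `model` is a hypothetical nn.Module:

transforms = [
    replace_feedforward_to_conv2d,  # fuse FeedForward blocks while their Linears are intact
    convert_linear_to_conv2d,       # then convert whatever Linears remain
]
for transform in transforms:
    model = transform(model)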

examples/models/llama/model.py (1 addition, 1 deletion)

@@ -245,7 +245,7 @@ def get_example_inputs_kvcache_sdpa(self):
         else:
             return (
                 torch.tensor(
-                    [[1]], dtype=torch.long
+                    [[1]], dtype=torch.int32
                 ),  # tokens, with kv cache our input token length is always just 1 token.
                 torch.tensor(
                     [0], dtype=torch.long

examples/models/llama/source_transformation/sdpa.py (53 additions, 4 deletions)

@@ -12,8 +12,9 @@
 from typing import Tuple, Union

 import torch
+import torch.nn.functional as F

-from executorch.examples.models.llama.llama_transformer import KVCache, SDPA
+from executorch.examples.models.llama.llama_transformer import KVCache, SDPA, FeedForward
 from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
     QuantizedKVCache,
 )

@@ -171,12 +172,14 @@ def __init__(
         self,
         kv_cache: KVCache,
         dim: int,
+        head_dim: int,
         n_rep: int,
     ):
         super().__init__()
         self.kv_cache = kv_cache
         self.dim = dim
         self.n_rep = n_rep
+        self.scale_factor = math.sqrt(head_dim)

     def forward(
         self,

@@ -195,8 +198,7 @@ def forward(
         v = repeat_kv(v, self.n_rep)
         attn_mask = mask[input_pos]

-        scale_factor = 1 / math.sqrt(q.size(-1))
-        attn_weight = q @ k.transpose(-2, -1) * scale_factor
+        attn_weight = q @ k.transpose(-2, -1) / self.scale_factor
         attn_weight += attn_mask
         attn_weight = torch.softmax(attn_weight, dim=-1)
         y = attn_weight @ v

@@ -223,7 +225,7 @@ def replace_sdpa_with_flex_sdpa(module: torch.nn.Module):
             setattr(
                 module,
                 name,
-                SDPAFlex(child.kv_cache, child.dim, child.n_rep),
+                SDPAFlex(child.kv_cache, child.dim, child.head_dim, child.n_rep),
             )
         else:
             replace_sdpa_with_flex_sdpa(child)
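Hoisting the scale into __init__ removes a sqrt from every forward call, and dividing by sqrt(head_dim) is algebraically the same as multiplying by 1 / sqrt(q.size(-1)) as long as the head dimension is static. A quick check with toy shapes (not from the commit):

import math
import torch

head_dim = 64
q = torch.randn(1, 8, 1, head_dim)
k = torch.randn(1, 8, 128, head_dim)
old = q @ k.transpose(-2, -1) * (1 / math.sqrt(q.size(-1)))
new = q @ k.transpose(-2, -1) / math.sqrt(head_dim)
assert torch.allclose(old, new, atol=1e-6)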
@@ -428,3 +430,50 @@ def replace_causal_mask(module: torch.nn.Module):
     for _, child in module.named_children():
         replace_causal_mask(child)
     return module
+
+
+class FeedForwardConv2D(torch.nn.Module):
+    def __init__(self, w1: torch.nn.Linear, w2: torch.nn.Linear, w3: torch.nn.Linear):
+        super().__init__()
+        self.w1_conv = torch.nn.Conv2d(
+            in_channels=w1.weight.shape[1],
+            out_channels=w1.weight.shape[0],
+            kernel_size=1,
+            padding=0,
+            bias=False,
+        )
+        self.w2_conv = torch.nn.Conv2d(
+            in_channels=w2.weight.shape[1],
+            out_channels=w2.weight.shape[0],
+            kernel_size=1,
+            padding=0,
+            bias=False,
+        )
+        self.w3_conv = torch.nn.Conv2d(
+            in_channels=w3.weight.shape[1],
+            out_channels=w3.weight.shape[0],
+            kernel_size=1,
+            padding=0,
+            bias=False,
+        )
+
+        self.w1_conv.weight = torch.nn.Parameter(w1.weight.reshape(*w1.weight.shape, 1, 1))
+        self.w2_conv.weight = torch.nn.Parameter(w2.weight.reshape(*w2.weight.shape, 1, 1))
+        self.w3_conv.weight = torch.nn.Parameter(w3.weight.reshape(*w3.weight.shape, 1, 1))
+
+    def forward(self, x):
+        rank = x.dim()
+        x = x.unsqueeze(-1) if rank == 3 else x.reshape(1, *x.shape, 1)
+        x = torch.transpose(x, 1, 2)
+        res = self.w2_conv(F.silu(self.w1_conv(x)) * self.w3_conv(x))
+        res = torch.transpose(res, 1, 2)
+        res = res.squeeze(-1) if rank == 3 else res.reshape(*res.shape[1:3])
+        return res
+
+
+def replace_feedforward_to_conv2d(module: torch.nn.Module):
+    for name, child in module.named_children():
+        if isinstance(child, FeedForward):
+            setattr(module, name, FeedForwardConv2D(child.w1, child.w2, child.w3))
+        else:
+            replace_feedforward_to_conv2d(child)
+    return module
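FeedForwardConv2D reshapes (batch, seq, dim) activations to NCHW with dim as channels, so each 1x1 conv reproduces the matching Linear of the SwiGLU feedforward w2(silu(w1(x)) * w3(x)), mirroring the existing convert_linear_to_conv2d trick at the module level. A hedged equivalence check against the Linear form, with toy dimensions of my choosing:

import torch
import torch.nn.functional as F

dim, hidden = 16, 32
w1 = torch.nn.Linear(dim, hidden, bias=False)
w3 = torch.nn.Linear(dim, hidden, bias=False)
w2 = torch.nn.Linear(hidden, dim, bias=False)

x = torch.randn(2, 7, dim)                 # (batch, seq, dim)
ref = w2(F.silu(w1(x)) * w3(x))
out = FeedForwardConv2D(w1, w2, w3)(x)
assert torch.allclose(ref, out, atol=1e-5)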

extension/llm/export/builder.py (1 addition, 1 deletion)

@@ -237,7 +237,7 @@ def calibrate_template(
     with torch.no_grad():
         while token_list[-1] != tokenizer.eos_id and pos < max_len:
             logits = module(
-                torch.full((1, 1), token_list[pos]),
+                torch.full((1, 1), token_list[pos], dtype=torch.int32),
                 torch.tensor((pos,)),
             )
             pos += 1
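torch.full infers int64 for a Python integer fill value, so without the explicit dtype the calibration tokens would no longer match the model's new int32 token input. A minimal demonstration:

import torch

assert torch.full((1, 1), 5).dtype == torch.int64
assert torch.full((1, 1), 5, dtype=torch.int32).dtype == torch.int32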
