Skip to content

Commit a255e2f

Browse files
author
Sanggyu Lee
committed
Add wq,wk,wv,wo and remove_unused_input pass
1 parent 16a07af commit a255e2f

File tree

5 files changed

+86
-4
lines changed

5 files changed

+86
-4
lines changed

test/modules/model/LlamaDecoderLayerWithKVCacheAndFusedAttention/layer.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ def populate_args(args_dict, filter):
8585
@torch.library.impl("circle::attention.llama", "CPU")
8686
def attention_llama_cpu(
8787
hidden_states,
88+
q_proj,
89+
k_proj,
90+
v_proj,
91+
o_proj,
8892
position_cos,
8993
position_sin,
9094
attention_mask,
@@ -100,6 +104,10 @@ def attention_llama_cpu(
100104
def attention_llama(*args, **kwargs):
101105
(
102106
hidden_states,
107+
q_proj,
108+
k_proj,
109+
v_proj,
110+
o_proj,
103111
position_cos,
104112
position_sin,
105113
attention_mask,
@@ -131,6 +139,10 @@ def forward_adapter(
131139
return (
132140
torch.ops.circle.attention.llama(
133141
hidden_states,
142+
self.q_proj.weight,
143+
self.k_proj.weight,
144+
self.v_proj.weight,
145+
self.o_proj.weight,
134146
position_embeddings[0], # cos
135147
position_embeddings[1], # sin
136148
attention_mask,
@@ -155,4 +167,4 @@ def forward_adapter(
155167
model = AutoModelForCausalLM.from_pretrained(model_name)
156168
model.eval()
157169
circle_model = tico.convert(model.model.layers[0], captured_input)
158-
circle_model.save(f"tinyllama.attn.circle")
170+
circle_model.save(f"tinyllama.layer.attn.circle")

test/modules/model/LlamaDecoderLayerWithKVCacheAndFusedAttention/model.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ def populate_args(args_dict, filter):
8787
@torch.library.impl("circle::attention.llama", "CPU")
8888
def attention_llama_cpu(
8989
hidden_states,
90+
q_proj,
91+
k_proj,
92+
v_proj,
93+
o_proj,
9094
position_cos,
9195
position_sin,
9296
attention_mask,
@@ -102,6 +106,10 @@ def attention_llama_cpu(
102106
def attention_llama(*args, **kwargs):
103107
(
104108
hidden_states,
109+
q_proj,
110+
k_proj,
111+
v_proj,
112+
o_proj,
105113
position_cos,
106114
position_sin,
107115
attention_mask,
@@ -133,6 +141,10 @@ def forward_adapter(
133141
return (
134142
torch.ops.circle.attention.llama(
135143
hidden_states,
144+
self.q_proj.weight,
145+
self.k_proj.weight,
146+
self.v_proj.weight,
147+
self.o_proj.weight,
136148
position_embeddings[0], # cos
137149
position_embeddings[1], # sin
138150
attention_mask,
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import TYPE_CHECKING
16+
17+
if TYPE_CHECKING:
18+
import torch.fx
19+
import torch
20+
from torch.export import ExportedProgram
21+
22+
from tico.passes import ops
23+
from tico.utils import logging
24+
from tico.utils.passes import PassBase, PassResult
25+
from tico.utils.trace_decorators import trace_graph_diff_on_pass
26+
27+
28+
@trace_graph_diff_on_pass
class RemoveUnusedInput(PassBase):
    """
    Remove dead graph inputs.

    Erases every ``placeholder`` node that has no users from the exported
    program's FX graph, so the serialized model does not carry inputs that
    nothing consumes (e.g. weights folded into a fused custom op).

    Returns a ``PassResult`` whose flag is True iff at least one input was
    erased.

    NOTE(review): erasing a placeholder does not update
    ``exported_program.graph_signature`` — presumably a later step (or the
    serializer) tolerates/rebuilds the input specs; confirm against callers.
    """

    def __init__(self):
        super().__init__()

    def call(self, exported_program: ExportedProgram) -> PassResult:
        logger = logging.getLogger(__name__)

        graph_module = exported_program.graph_module
        graph = graph_module.graph
        modified = False
        # Iterate over a snapshot: erase_node() mutates the graph's node
        # list, and removing nodes while iterating graph.nodes directly can
        # skip entries.
        for node in list(graph.nodes):
            if node.op == "placeholder" and len(node.users) == 0:
                graph.erase_node(node)
                modified = True
                logger.debug("RemoveUnusedInput: erased %s", node.name)

        graph.lint()
        graph_module.recompile()

        return PassResult(modified)

tico/serialize/operators/op_circle_attention.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
"""
3333
attention.llama(
3434
Tensor hidden_states,
35+
Tensor wq,
36+
Tensor wk,
37+
Tensor wv,
38+
Tensor wo,
3539
Tensor position_cos,
3640
Tensor position_sin,
3741
Tensor? attention_mask,
@@ -59,6 +63,10 @@ def define_node(
5963
) -> circle.Operator.OperatorT:
6064
(
6165
hidden_states,
66+
wq,
67+
wk,
68+
wv,
69+
wo,
6270
position_cos,
6371
position_sin,
6472
attention_mask,
@@ -68,9 +76,6 @@ def define_node(
6876
layer_idx,
6977
) = node.args
7078

71-
inputs = node.args
72-
outputs = [node]
73-
7479
op_index = get_op_index(
7580
circle.BuiltinOperator.BuiltinOperator.ATTENTION, self._op_codes
7681
)

tico/utils/convert.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
from tico.passes.lower_to_slice import passes as LowerToSlicePasses
6464
from tico.passes.merge_consecutive_cat import MergeConsecutiveCat
6565
from tico.passes.remove_nop import RemoveNop
66+
from tico.passes.remove_unused_inputs import RemoveUnusedInput
6667
from tico.passes.remove_redundant_assert_nodes import RemoveRedundantAssertionNodes
6768
from tico.passes.remove_redundant_expand import RemoveRedundantExpand
6869
from tico.passes.remove_redundant_permute import passes as RemoveRedundantPermutePasses
@@ -251,6 +252,7 @@ def convert_exported_module_to_circle(
251252
ConvertConv1dToConv2d(),
252253
*LowerToSlicePasses(),
253254
FuseLeadingUnsqueezeReshape(),
255+
RemoveUnusedInput(),
254256
]
255257
)
256258
circle_legalize.run(exported_program)

0 commit comments

Comments
 (0)