
Commit d686472

Qualcomm AI Engine Direct - Static LLM Refactor & Qwen3 1.7B Improvement (#13755)
### Summary
- Refactor llama.py. The current script is hard to customize, especially around quantization configs. As more models have been enabled, it has grown messy, relying on multiple `if`-`else` statements to decide which optimizations apply to which model. We want to move all the model specs under `__init__.py` (a hedged sketch of this idea follows below).
- Hide scale/offset in the model's metadata, so `args.quant_attrs_path` is no longer required when evaluating the perplexity (ppl) score (a hedged sketch appears after the file diffs).
- Enable Qwen3 1.7B with 16a4w_block quantization. It previously used 16a8w, which is much slower. The goal is to maximize token rate while keeping ppl within a 20% margin of the FP CPU baseline.

#### Stats
- token rate = 37 tok/sec
- ppl = 14.79

### Test plan
Tested all scripts to ensure no regression.

cc: @haowhsu-quic
1 parent 4d1da11 commit d686472
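
The first Summary bullet replaces per-model `if`-`else` branching in llama.py with model specs declared in `__init__.py`. Below is a minimal sketch of that idea; the class name `DecoderModelSpec`, the registry name `SUPPORTED_MODELS`, and every field are hypothetical illustrations, not the PR's actual definitions.

```python
# Hypothetical sketch only: names and fields are assumptions, not the PR's code.
from dataclasses import dataclass
from typing import Callable, Optional, Tuple


@dataclass(frozen=True)
class DecoderModelSpec:
    repo: str                          # checkpoint / HF repo identifier
    ptq: str                           # default quant scheme, e.g. "16a4w_block"
    group_size: Optional[int] = None   # only meaningful for block quantization
    num_sharding: int = 1
    r3: bool = False
    masked_softmax: bool = False
    custom_annotations: Tuple[Callable, ...] = ()


# llama.py would look the spec up by --decoder_model instead of branching with
# if/else, which is also why the tests below no longer pass --ptq, --num_sharding,
# --r3, or --enable_masked_softmax explicitly.
SUPPORTED_MODELS = {
    "qwen3-1_7b": DecoderModelSpec(
        repo="Qwen/Qwen3-1.7B",
        ptq="16a4w_block",  # switched from 16a8w for a better token rate
        group_size=16,      # placeholder value for illustration
    ),
}
```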

File tree

8 files changed: +452 additions, -305 deletions


backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 27 additions & 40 deletions
```diff
@@ -92,9 +92,12 @@ def annotate_mimi_decoder(gm: torch.fx.GraphModule):
             break
 
 
-def annotate_linear_16a8w_in_affine_layer(
-    gm: torch.fx.GraphModule, is_qat: bool = False
-) -> None:
+def annotate_output_16a8w(gm: torch.fx.GraphModule, is_qat: bool = False) -> None:
+    """
+    This function is for static LLM models.
+    This function will annotate the last conv(linear), which is the lm_head, as 16a8w.
+    """
+
     def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None:
         input_qspec_map = {}
         input_act = node.args[0]
@@ -163,11 +166,30 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
         )
 
 
-def annotate_matmul_16a8w(  # noqa: C901
+def annotate_wv_sha(gm: torch.fx.GraphModule, quantization_config: QuantizationConfig):
+    for node in gm.graph.nodes:
+        if (
+            node.target == torch.ops.aten.conv2d.default
+            and "wv_sha" in node.meta["stack_trace"]
+        ):
+            input_qspec_map = {}
+            input_qspec_map[node.args[0]] = quantization_config.input_activation
+            input_qspec_map[node.args[1]] = quantization_config.weight
+            if len(node.args) > 2 and isinstance(node.args[2], Node):
+                input_qspec_map[node.args[2]] = quantization_config.bias(node)
+            node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                output_qspec=quantization_config.output_activation,
+                _annotated=True,
+            )
+
+
+def annotate_kv_8bit(  # noqa: C901
     gm: torch.fx.GraphModule,
     is_qat=False,
 ) -> None:
     """
+    This function is for static LLM models.
     This function is specific for matmul op 16a8w.
     For k, we will tag such as the below, and
     for v, we will tag 8a until conv op.
@@ -213,25 +235,6 @@ def annotate_cat(node: Node, quantization_config: QuantizationConfig):
             _annotated=True,
         )
 
-    def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None:
-        input_qspec_map = {}
-        input_act = node.args[0]
-        input_spec = quantization_config.input_activation
-        input_qspec_map[input_act] = input_spec
-
-        weight = node.args[1]
-        input_qspec_map[weight] = quantization_config.weight
-
-        if len(node.args) > 2 and isinstance(node.args[2], Node):
-            bias = node.args[2]
-            input_qspec_map[bias] = quantization_config.bias(node)
-
-        node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
-            input_qspec_map=input_qspec_map,
-            output_qspec=quantization_config.output_activation,
-            _annotated=True,
-        )
-
     def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None:
         act_node = node.args[0]
         weight_node = node.args[2]
@@ -301,22 +304,10 @@ def annotate_matmul_input1(node: Node, is_qat: str):
             quantization_config_8a8w = get_8a8w_qnn_qat_config(
                 act_symmetric=True, act_observer=MinMaxObserver
             )
-            quantization_config_8a4w_per_channel = get_qat_per_channel_quant_config(
-                act_dtype=torch.uint8,
-                weight_dtype=torch.int4,
-                act_observer=MinMaxObserver,
-                act_symmetric=True,
-            )
         else:
             quantization_config_8a8w = get_8a8w_qnn_ptq_config(
                 act_symmetric=True, act_observer=MinMaxObserver
             )
-            quantization_config_8a4w_per_channel = get_ptq_per_channel_quant_config(
-                act_dtype=torch.uint8,
-                weight_dtype=torch.int4,
-                act_observer=MinMaxObserver,
-                act_symmetric=True,
-            )
         while isinstance(node, Node) and node.op == "call_function":
             if node.target in [
                 torch.ops.aten.permute.default,
@@ -343,15 +334,11 @@ def annotate_matmul_input1(node: Node, is_qat: str):
                 # For k, we tag 8a until add or sub op (rotatary embedding).
                 # The arguments of cat op: (the past kv cache, the new kv cache)
                 node = node.args[0][1]
-            elif node.target == torch.ops.aten.conv2d.default:
-                annotate_conv2d(
-                    node, quantization_config=quantization_config_8a4w_per_channel
-                )
-                break
             elif node.target in [
                 torch.ops.aten.add.Tensor,
                 torch.ops.aten.sub.Tensor,
                 torch.ops.aten.matmul.default,
+                torch.ops.aten.conv2d.default,
             ]:
                 break
             else:
```
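
For context, here is a hedged sketch of how the renamed helpers above (`annotate_output_16a8w`, `annotate_kv_8bit`) might be attached to a quantizer. It assumes `QnnQuantizer` exposes an `add_custom_quant_annotations` hook taking a tuple of callables, as earlier example scripts do; treat the exact API as an assumption and verify against the repo.

```python
# Hedged usage sketch: attaching the custom annotation helpers to a quantizer.
# Assumes QnnQuantizer.add_custom_quant_annotations() accepts a tuple of
# callables that each receive the fx GraphModule being annotated.
from functools import partial

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_kv_8bit,
    annotate_output_16a8w,
)
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

quantizer = QnnQuantizer()
quantizer.add_custom_quant_annotations(
    (
        partial(annotate_kv_8bit, is_qat=False),       # 16a8w matmul, 8-bit kv path
        partial(annotate_output_16a8w, is_qat=False),  # lm_head conv as 16a8w
    )
)
# The annotated graph then goes through prepare_pt2e / convert_pt2e as usual.
```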

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 0 additions & 24 deletions
```diff
@@ -4582,8 +4582,6 @@ def test_llama3_2_1b(self):
             str(self.port),
             "--prompt",
             f"{prompt}",
-            "--ptq",
-            "16a4w",
             "--temperature",
             "0",
             "--decoder_model",
@@ -4594,8 +4592,6 @@
             "32",
             "--max_seq_len",
             "512",
-            "--num_sharding",
-            "4",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -4662,8 +4658,6 @@ def test_llama_stories_260k(self):
             str(self.port),
             "--prompt",
             f"{prompt}",
-            "--ptq",
-            "16a4w",
             "--temperature",
             "0",
             "--decoder_model",
@@ -4740,8 +4734,6 @@ def test_llama_stories_110m(self):
             str(self.port),
             "--prompt",
             f"{prompt}",
-            "--ptq",
-            "16a4w",
             "--temperature",
             "0",
             "--decoder_model",
@@ -4806,18 +4798,12 @@ def test_static_phi4(self):
             str(self.port),
             "--prompt",
             f"{prompt}",
-            "--ptq",
-            "16a4w_block",
-            "--group_size",
-            "16",
             "--decoder_model",
             "phi_4_mini",
             "--model_mode",
             "kv",
             "--max_seq_len",
             "1024",
-            "--num_sharding",
-            "8",
             "--eval_perplexity",
             "--tasks",
             "wikitext",
@@ -4877,8 +4863,6 @@ def test_static_qwen2_5(self):
             str(self.port),
             "--prompt",
             f"{prompt}",
-            "--ptq",
-            "16a8w",
             "--decoder_model",
             "qwen2_5-0_5b",
             "--model_mode",
@@ -4890,8 +4874,6 @@
             "wikitext",
             "--limit",
             "1",
-            "--r3",
-            "--enable_masked_softmax",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -4940,8 +4922,6 @@ def test_static_qwen3(self):
             str(self.port),
             "--prompt",
             f"{prompt}",
-            "--ptq",
-            "16a8w",
             "--decoder_model",
             "qwen3-0_6b",
             "--model_mode",
@@ -4953,8 +4933,6 @@
             "wikitext",
             "--limit",
             "1",
-            "--r3",
-            "--enable_masked_softmax",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -5003,8 +4981,6 @@ def test_static_smollm2(self):
             str(self.port),
             "--prompt",
             f"{prompt}",
-            "--ptq",
-            "16a8w",
             "--decoder_model",
             "smollm2_135m",
             "--model_mode",
```

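Finally, the second Summary bullet, hiding the logits scale/offset in the model's metadata so `--quant_attrs_path` is no longer needed for perplexity evaluation, can be illustrated with a minimal sketch. The toy module and the method names `get_logits_scale` / `get_logits_offset` are assumptions for illustration, not the PR's actual implementation.

```python
# Minimal sketch: carry quant attrs as program metadata instead of a JSON side
# file. Method names and values here are illustrative assumptions.
import torch
from executorch.exir import to_edge


class TinyDecoder(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.linear(x, torch.ones(8, 8))


exported = torch.export.export(TinyDecoder(), (torch.randn(1, 8),))

# Scale/offset observed for the output logits during quantization (placeholders).
logits_scale, logits_offset = 0.015, 0

edge = to_edge(
    exported,
    # constant_methods become queryable metadata methods on the final program,
    # so an eval script can read them directly instead of --quant_attrs_path.
    constant_methods={
        "get_logits_scale": logits_scale,
        "get_logits_offset": logits_offset,
    },
)
program = edge.to_executorch()  # program.buffer can then be written out as a .pte
```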