
Commit d36f7ea

Static LLM Decoder Refactor
1 parent 9c73b5d

File tree: 5 files changed (+14, -10 lines changed)

backends/qualcomm/tests/test_qnn_delegate.py
Lines changed: 2 additions & 2 deletions

@@ -4588,7 +4588,7 @@ def test_static_qwen2_5(self):
             msg["inference_speed"], inference_speed_ref[self.model]
         )
 
-    def test_qwen3(self):
+    def test_static_qwen3(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
 
@@ -4611,7 +4611,7 @@ def test_qwen3(self):
             "--ptq",
             "16a8w",
             "--decoder_model",
-            "qwen3_0.6b",
+            "qwen3_0_6b",
             "--model_mode",
             "hybrid",
             "--prefill_ar_len",

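Note: the new "qwen3_0_6b" value passed to --decoder_model (underscores instead of a dot) matches the keys added to examples/qualcomm/oss_scripts/llama/decoder_constants.py below.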
examples/qualcomm/oss_scripts/llama/decoder_constants.py
Lines changed: 2 additions & 0 deletions

@@ -15,5 +15,7 @@
     "stories110m": "llama2",
     "llama3_2": "llama3",
     "qwen2_5": "qwen2_5",
+    "qwen3_0_6b": "qwen2_5",  # TODO: temp workaround, use special token for qwen3 in runner
+    "qwen3_1_7b": "qwen2_5",
     "phi_4_mini": "phi_4_mini",
 }
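As context for the TODO, a hedged sketch of how a decoder-to-version mapping like this is typically consumed on the runner side; DECODER_MODEL_VERSION mirrors the diff above, but RUNNER_EOS_TOKEN and eos_for are hypothetical stand-ins, not code from this repository:

# Hypothetical consumer of the mapping above: the runner picks its
# prompt/special-token handling by "version" family, so routing both
# qwen3 keys to "qwen2_5" reuses Qwen2.5's handling until the runner
# gains Qwen3 special tokens, per the TODO.
DECODER_MODEL_VERSION = {
    "qwen2_5": "qwen2_5",
    "qwen3_0_6b": "qwen2_5",  # temp workaround from this diff
    "qwen3_1_7b": "qwen2_5",
}

RUNNER_EOS_TOKEN = {  # hypothetical per-family table
    "qwen2_5": "<|im_end|>",
}


def eos_for(decoder_model: str) -> str:
    return RUNNER_EOS_TOKEN[DECODER_MODEL_VERSION[decoder_model]]


assert eos_for("qwen3_0_6b") == eos_for("qwen2_5")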

examples/qualcomm/oss_scripts/llama/llama.py
Lines changed: 1 addition & 1 deletion

@@ -428,7 +428,7 @@ def compile(args, pte_filename, tokenizer):
     if args.checkpoint is None:  # HF models
         checkpoint = download_and_convert_hf_checkpoint(
             SUPPORTED_HF_MODELS[args.decoder_model].repo_id,
-            SUPPORTED_HF_MODELS[args.decoder_model].convert_weights,
+            SUPPORTED_HF_MODELS[args.decoder_model].convert_weights.__func__,
         )
         state_dict = torch.load(
             checkpoint, weights_only=True, map_location="cpu", mmap=True
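A hedged reading of the .__func__ change: if each SUPPORTED_HF_MODELS entry stores convert_weights as a staticmethod object (an assumption, not verified against this repository), instance-attribute lookup returns the descriptor itself, which is only directly callable on Python 3.10+, and .__func__ unwraps the plain function. Minimal sketch with stand-in names (HFModel and convert_qwen_weights are illustrative):

from dataclasses import dataclass


def convert_qwen_weights(path):  # hypothetical converter function
    return {"checkpoint": path}


@dataclass
class HFModel:  # stand-in for the real SUPPORTED_HF_MODELS entry type
    repo_id: str
    convert_weights: staticmethod


entry = HFModel("Qwen/Qwen3-0.6B", staticmethod(convert_qwen_weights))

# Instance-attribute lookup returns the staticmethod descriptor itself,
# which is only directly callable on Python >= 3.10; ".__func__" unwraps
# the underlying function so it can be passed around and called anywhere.
fn = entry.convert_weights.__func__
print(fn("ckpt.bin"))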

examples/qualcomm/oss_scripts/llama/model/static_llama.py
Lines changed: 8 additions & 6 deletions

@@ -104,7 +104,7 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False):
 
         self.scale = float(self.head_dim) ** 0.5
 
-        if hasattr(config, "enable_r3") and config.enable_r3:
+        if getattr(config, "enable_r3", False):
             self.register_buffer(
                 "r3_weight",
                 torch.tensor(
@@ -223,18 +223,20 @@ def forward_sha(  # noqa: C901
             if self.use_qk_norm and self.qk_norm_before_rope:
                 q[i] = self.q_norm_fn(q[i])
             q[i] = self.apply_rope_emb(q[i], freqs_cos, freqs_sin)
-            if hasattr(self.config, "enable_r3") and self.config.enable_r3:
-                q[i] = torch.matmul(q[i], self.r3_weight)
             if self.use_qk_norm and not self.qk_norm_before_rope:
                 q[i] = self.q_norm_fn(q[i])
+            if getattr(self.config, "enable_r3", False):
+                q[i] = torch.matmul(q[i], self.r3_weight)
+
         for i in range(len(k)):
             if self.use_qk_norm and self.qk_norm_before_rope:
                 k[i] = self.k_norm_fn(k[i])
-            k[i] = self.apply_rope_emb(k[i], freqs_cos, freqs_sin).transpose(1, 2)
-            if hasattr(self.config, "enable_r3") and self.config.enable_r3:
-                k[i] = torch.matmul(k[i], self.r3_weight)
+            k[i] = self.apply_rope_emb(k[i], freqs_cos, freqs_sin)
             if self.use_qk_norm and not self.qk_norm_before_rope:
                 k[i] = self.k_norm_fn(k[i])
+            if getattr(self.config, "enable_r3", False):
+                k[i] = torch.matmul(k[i], self.r3_weight)
+            k[i] = k[i].transpose(1, 2)
 
         output_y = []
         kh, vh = [], []
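Two hedged sketches of what this hunk changes: getattr(..., False) folds the hasattr-and-read guard into one lookup, and the key transpose now runs after the R3 rotation (which itself moves after the optional QK-norm). The shapes and the Config stand-in below are assumptions, not taken from static_llama.py:

import torch


# (1) getattr with a default is equivalent to the old hasattr-and-read guard.
class Config:
    pass  # "enable_r3" may or may not be set on an instance


cfg = Config()
assert (hasattr(cfg, "enable_r3") and cfg.enable_r3) == getattr(
    cfg, "enable_r3", False
)

# (2) Why the transpose must come last: assume a per-head key of shape
# (batch, seq, head_dim) and an R3 rotation of shape (head_dim, head_dim).
# Rotating before the transpose contracts over head_dim as intended; the
# old order would try to contract over seq and fail (or silently mix axes
# whenever seq happens to equal head_dim).
batch, seq, head_dim = 1, 8, 64
k = torch.randn(batch, seq, head_dim)
r3 = torch.randn(head_dim, head_dim)

new_order = torch.matmul(k, r3).transpose(1, 2)  # rotate, then (b, d, s)
try:
    torch.matmul(k.transpose(1, 2), r3)  # old order: (b, d, s) @ (d, d)
except RuntimeError as err:
    print("old ordering fails:", err)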

examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@
  * @file
  *
  * This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B, Qwen3 0.6B
- * / 1.7B phi4-mini-instruct with Qualcomm AI Engine Direct.
+ * / 1.7B, phi4-mini-instruct with Qualcomm AI Engine Direct.
 *
 */
 
