Skip to content

Commit cf669e3

Browse files
authored
Qualcomm AI Engine Direct - Static LLM Decoder Refactor (#13314)
### Summary
- Update unit-test name
- Revert R3 changes to original behavior
- Minor refactor of code logic

### Test plan
NA
1 parent 4438d31 commit cf669e3

File tree

6 files changed

+15
-11
lines changed

6 files changed

+15
-11
lines changed

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4590,7 +4590,7 @@ def test_static_qwen2_5(self):
45904590
msg["inference_speed"], inference_speed_ref[self.model]
45914591
)
45924592

4593-
def test_qwen3(self):
4593+
def test_static_qwen3(self):
45944594
if not self.required_envs():
45954595
self.skipTest("missing required envs")
45964596

@@ -4613,7 +4613,7 @@ def test_qwen3(self):
46134613
"--ptq",
46144614
"16a8w",
46154615
"--decoder_model",
4616-
"qwen3_0.6b",
4616+
"qwen3_0_6b",
46174617
"--model_mode",
46184618
"hybrid",
46194619
"--prefill_ar_len",

examples/qualcomm/oss_scripts/llama/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class Qwen3_0_6B(HFModel):
6868
@register_hf_model("qwen3_1_7b")
6969
@dataclass(init=False, frozen=True)
7070
class Qwen3_1_7B(HFModel):
71-
repo_id: str = "Qwen/Qwen/Qwen3-1.7B"
71+
repo_id: str = "Qwen/Qwen3-1.7B"
7272
params_path: str = os.path.join(
7373
BASE_DIR, "../../../models/qwen3/config/1_7b_config.json"
7474
)

examples/qualcomm/oss_scripts/llama/decoder_constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,7 @@
1515
"stories110m": "llama2",
1616
"llama3_2": "llama3",
1717
"qwen2_5": "qwen2_5",
18+
"qwen3_0_6b": "qwen2_5", # TODO: temp workaround, use special token for qwen3 in runner
19+
"qwen3_1_7b": "qwen2_5",
1820
"phi_4_mini": "phi_4_mini",
1921
}

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ def compile(args, pte_filename, tokenizer):
429429
if args.checkpoint is None: # HF models
430430
checkpoint = download_and_convert_hf_checkpoint(
431431
SUPPORTED_HF_MODELS[args.decoder_model].repo_id,
432-
SUPPORTED_HF_MODELS[args.decoder_model].convert_weights,
432+
SUPPORTED_HF_MODELS[args.decoder_model].convert_weights.__func__,
433433
)
434434
state_dict = torch.load(
435435
checkpoint, weights_only=True, map_location="cpu", mmap=True

examples/qualcomm/oss_scripts/llama/model/static_llama.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False):
104104

105105
self.scale = float(self.head_dim) ** 0.5
106106

107-
if hasattr(config, "enable_r3") and config.enable_r3:
107+
if getattr(config, "enable_r3", False):
108108
self.register_buffer(
109109
"r3_weight",
110110
torch.tensor(
@@ -223,18 +223,20 @@ def forward_sha( # noqa: C901
223223
if self.use_qk_norm and self.qk_norm_before_rope:
224224
q[i] = self.q_norm_fn(q[i])
225225
q[i] = self.apply_rope_emb(q[i], freqs_cos, freqs_sin)
226-
if hasattr(self.config, "enable_r3") and self.config.enable_r3:
227-
q[i] = torch.matmul(q[i], self.r3_weight)
228226
if self.use_qk_norm and not self.qk_norm_before_rope:
229227
q[i] = self.q_norm_fn(q[i])
228+
if getattr(self.config, "enable_r3", False):
229+
q[i] = torch.matmul(q[i], self.r3_weight)
230+
230231
for i in range(len(k)):
231232
if self.use_qk_norm and self.qk_norm_before_rope:
232233
k[i] = self.k_norm_fn(k[i])
233-
k[i] = self.apply_rope_emb(k[i], freqs_cos, freqs_sin).transpose(1, 2)
234-
if hasattr(self.config, "enable_r3") and self.config.enable_r3:
235-
k[i] = torch.matmul(k[i], self.r3_weight)
234+
k[i] = self.apply_rope_emb(k[i], freqs_cos, freqs_sin)
236235
if self.use_qk_norm and not self.qk_norm_before_rope:
237236
k[i] = self.k_norm_fn(k[i])
237+
if getattr(self.config, "enable_r3", False):
238+
k[i] = torch.matmul(k[i], self.r3_weight)
239+
k[i] = k[i].transpose(1, 2)
238240

239241
output_y = []
240242
kh, vh = [], []

examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* @file
1111
*
1212
* This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B, Qwen3 0.6B
13-
* / 1.7B phi4-mini-instruct with Qualcomm AI Engine Direct.
13+
* / 1.7B, phi4-mini-instruct with Qualcomm AI Engine Direct.
1414
*
1515
*/
1616

0 commit comments

Comments (0)