
Commit f154d50

Qualcomm AI Engine Direct - Scripts and accuracy improvement for Qwen3_0.6B/1.7B and Qwen 2.5_1.5B (#13544)
### Summary
- Adding static Qwen 2.5 1.5B to the script.
- Adding static Qwen 3 0.6B/1.7B to the script.
- Adding back `skip_advanced_requant`.
- Adding prompt + special tokens for calibration, which helps certain models improve accuracy.

#### Example Script
```bash
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -H haowhsu-linux -s 5f396958 -m SM8750 --prompt "How many r's in strawberries?" --temperature 0 --model_mode kv --max_seq_len 1024 --ptq 16a8w --decoder_model qwen3-0_6b --tasks wikitext --limit 1 --artifact ./qwen3-0_6b
```

#### Statistics on SM8750, seq_len=1024
- Qwen2.5 1.5B: ~34 tok/sec; QNN on-device PPL = 9.4 (CPU FP = 9.1)
- Qwen3 0.6B: ~56 tok/sec; QNN on-device PPL = 16.8 (CPU FP = 16.26)
- Qwen3 1.7B: ~14 tok/sec; QNN on-device PPL = 14.1 (CPU FP = 13.52)

### Test plan
E2E in test_qnn_delegate.py
1 parent 0f444ab commit f154d50

File tree

10 files changed: +228 −66 lines


backends/qualcomm/_passes/annotate_quant_attrs.py

Lines changed: 29 additions & 11 deletions
```diff
@@ -30,9 +30,14 @@ class AnnotateQuantAttrs(ExportPass):
     generated after quantization process.
     """
 
-    def __init__(self, edge_program: torch.export.ExportedProgram):
+    def __init__(
+        self,
+        edge_program: torch.export.ExportedProgram,
+        skip_advanced_requant: bool = False,
+    ):
         super(AnnotateQuantAttrs, self).__init__()
         self.edge_program = edge_program
+        self.skip_advanced_requant = skip_advanced_requant
 
     def _annotate_source_nodes(
         self, quant_node: torch.fx.Node, quant_attrs: Dict[str, Any]
@@ -82,16 +87,29 @@ def _annotate_requant(self, n):
         # TODO: Store multiple pairs of requantize attributes when we have an op builder
         # that has multiple outputs that requires quant attributes.
 
-        if any(
-            q_attrs[attr] != dq_attrs[attr]
-            for attr in [
-                QCOM_SCALE,
-                QCOM_ZERO_POINT,
-                QCOM_QUANT_MIN,
-                QCOM_QUANT_MAX,
-                QCOM_DTYPE,
-            ]
-        ):
+        # Determine if requantization is needed based on configuration and attribute mismatch.
+        is_requant_needed = False
+        if self.skip_advanced_requant:
+            # In skip_advanced_requant mode, only consider requant if dtypes differ.
+            if q_attrs[QCOM_DTYPE] != dq_attrs[QCOM_DTYPE]:
+                is_requant_needed = True
+        else:
+            # In full requant mode, consider requant if any key attribute differs.
+            # This aims to improve accuracy by adjusting scale, zero_point, etc.
+            # Users can disable this if it causes regressions.
+            if any(
+                q_attrs[attr] != dq_attrs[attr]
+                for attr in [
+                    QCOM_SCALE,
+                    QCOM_ZERO_POINT,
+                    QCOM_QUANT_MIN,
+                    QCOM_QUANT_MAX,
+                    QCOM_DTYPE,
+                ]
+            ):
+                is_requant_needed = True
+
+        if is_requant_needed:
             dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
             user_node = list(dq_node.users)[0]
             n.args[0].meta.setdefault(QCOM_REQUANTIZE, {})
```
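In short, `skip_advanced_requant` narrows the requantization trigger from "any quantization attribute differs" to "only the dtype differs". A minimal, self-contained sketch of that decision follows; the `needs_requant` helper and the literal key strings are illustrative stand-ins, not part of the pass:

```python
# Illustrative sketch of the decision introduced above; not the actual ExportPass code.
# The QCOM_* names below are assumed placeholder keys standing in for the real constants.
QCOM_SCALE, QCOM_ZERO_POINT = "scale", "zero_point"
QCOM_QUANT_MIN, QCOM_QUANT_MAX, QCOM_DTYPE = "quant_min", "quant_max", "dtype"


def needs_requant(q_attrs: dict, dq_attrs: dict, skip_advanced_requant: bool) -> bool:
    if skip_advanced_requant:
        # Conservative mode: requantize only when the dtypes disagree.
        return q_attrs[QCOM_DTYPE] != dq_attrs[QCOM_DTYPE]
    # Full mode: requantize when any key quantization attribute disagrees.
    keys = (QCOM_SCALE, QCOM_ZERO_POINT, QCOM_QUANT_MIN, QCOM_QUANT_MAX, QCOM_DTYPE)
    return any(q_attrs[k] != dq_attrs[k] for k in keys)


# Same dtype but different scale: requant fires only in full mode.
q = {"scale": 0.02, "zero_point": 0, "quant_min": -128, "quant_max": 127, "dtype": "int8"}
dq = {**q, "scale": 0.03}
assert needs_requant(q, dq, skip_advanced_requant=False)
assert not needs_requant(q, dq, skip_advanced_requant=True)
```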

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 18 additions & 14 deletions
```diff
@@ -4564,7 +4564,7 @@ def test_static_qwen2_5(self):
             "--ptq",
             "16a8w",
             "--decoder_model",
-            "qwen2_5",
+            "qwen2_5-0_5b",
             "--model_mode",
             "kv",
             "--max_seq_len",
@@ -4627,13 +4627,18 @@ def test_static_qwen3(self):
             "--ptq",
             "16a8w",
             "--decoder_model",
-            "qwen3_0_6b",
+            "qwen3-0_6b",
             "--model_mode",
-            "hybrid",
-            "--prefill_ar_len",
-            "32",
+            "kv",
             "--max_seq_len",
-            "128",
+            "1024",
+            "--eval_perplexity",
+            "--tasks",
+            "wikitext",
+            "--limit",
+            "1",
+            "--r3",
+            "--enable_masked_softmax",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -4646,8 +4651,6 @@ def test_static_qwen3(self):
         if self.pre_gen_pte:
             cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
 
-        # Accuracy is bad for now. Just check user's prompt is returned.
-        golden_start_with = "My favourite condiment is "
         p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
         with Listener((self.ip, self.port)) as listener:
             conn = listener.accept()
@@ -4656,12 +4659,13 @@ def test_static_qwen3(self):
             if "Error" in msg:
                 self.fail(msg["Error"])
             else:
-                model_out = msg["result"][0]
-                self.assertTrue(
-                    model_out.startswith(golden_start_with),
-                    f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
-                )
-                self.assertGreaterEqual(msg["inference_speed"], 70)  # Lanai
+                inference_speed_ref = {"SM8650": 38, "SM8750": 56}
+                self.assertLessEqual(msg["wiki_ppl"], 18)
+                self.assertLessEqual(msg["pte_size"], 950_000_000)  # 950mb
+                if self.model in inference_speed_ref:
+                    self.assertGreaterEqual(
+                        msg["inference_speed"], inference_speed_ref[self.model]
+                    )
 
     def test_smollm2(self):
         if not self.required_envs():
```

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 24 additions & 6 deletions
````diff
@@ -5,7 +5,7 @@ This file provides you the instructions to run LLM Decoder model with different
 1. LLAMA2 Stories 110M
 2. LLAMA3.2 1B
 3. LLAMA3.2 3B
-4. QWEN2.5 0.5B
+4. QWEN2.5 0.5B / 1.5B
 5. QWEN3 0.6B / 1.7B
 6. Phi4-mini-instruct
 7. SMOLLM2 135M
@@ -72,13 +72,31 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL
 #### QWEN2.5 0.5B
 Default example using hybrid mode
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --ptq 16a8w --enable_masked_softmax --r3 --decoder_model qwen2_5 --prompt "I would like to learn python, could you teach me with a simple example?"
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --ptq 16a8w --enable_masked_softmax --r3 --decoder_model qwen2_5-0_5b --prompt "I would like to learn python, could you teach me with a simple example?"
+```
+
+#### QWEN2.5 1.5B
+Default example using hybrid mode
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --ptq 16a8w --enable_masked_softmax --r3 --decoder_model qwen2_5-1_5b --prompt "I would like to learn python, could you teach me with a simple example?"
+```
+
+#### QWEN3 0.6B
+Default example using hybrid mode
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --ptq 16a8w --enable_masked_softmax --r3 --decoder_model qwen3-0_6b --prompt "I would like to learn python, could you teach me with a simple example?"
+```
+
+#### QWEN3 1.7B
+Default example using hybrid mode
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --ptq 16a8w --enable_masked_softmax --r3 --decoder_model qwen3-1_7b --prompt "I would like to learn python, could you teach me with a simple example?"
 ```
 
 #### SMOLLM2
 Default example using hybrid mode.
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -H mlgtw-linux -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a8w --tokenizer_bin tokenizer.bin --decoder_model smollm2 --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?"
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -H mlgtw-linux -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a8w --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?"
 ```
 
 ### KV Cache update mechanism
@@ -175,18 +193,18 @@ To evaluate the perplexity across all 3 phases, users should provide the `--eval
 
 For example, using the Qwen model and 1 wikitext sample as the evaluation task, users can assess all 3 phases perplexity score in a single run by including the appropriate configuration:
 ```bash
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --ptq 16a8w --decoder_model qwen2_5 --eval_perplexity --tasks wikitext --limit 1
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --ptq 16a8w --decoder_model qwen2_5-0_5b --eval_perplexity --tasks wikitext --limit 1
 ```
 
 For the example script above, 1 wikitext sample is used to evaluate all 3 phases. However, there are cases where a user may want to use one sample for quantization calibration and multiple samples for perplexity evaluation. In this case, the process should be split into two runs. In the 1st run, the model is compiled using one sample. In the 2nd run, the user can provide a different configuration for QNN device execution.
 Example:
 ```bash
 # 1st run to compile with --limit 1
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --ptq 16a8w --decoder_model qwen2_5 --eval_perplexity --tasks wikitext --limit 1 --compile_only
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --ptq 16a8w --decoder_model qwen2_5-0_5b --eval_perplexity --tasks wikitext --limit 1 --compile_only
 ```
 ```bash
 # 2nd run to perform QNN device execution with --limit 3
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --ptq 16a8w --decoder_model qwen2_5 --eval_perplexity --tasks wikitext --limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --prompt "What is 1+1?" --temperature 0 --model_mode kv --max_seq_len 1024 --ptq 16a8w --decoder_model qwen2_5-0_5b --eval_perplexity --tasks wikitext --limit 3 --pre_gen_pte ${PATH_TO_ARTIFACT_IN_1ST_RUN} --quant_attrs_path ${PATH_TO_ARTIFACT_IN_1ST_RUN}/kv_llama_qnn_quant_attrs.json
 ```
 
 #### Tasks quantization calibration
````

examples/qualcomm/oss_scripts/llama/__init__.py

Lines changed: 33 additions & 14 deletions
```diff
@@ -6,7 +6,7 @@
 
 import os
 from abc import ABC
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Callable, Dict, Type
 
 from executorch.examples.models.phi_4_mini import (
@@ -19,19 +19,26 @@
 from executorch.examples.models.smollm2 import (
     convert_weights as convert_smollm2_weights,
 )
-from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
-    DECODER_MODEL_VERSION,
-)
 
 BASE_DIR = os.path.dirname(__file__)
 
 
 @dataclass(init=False, frozen=True)
 class HFModel(ABC):
+    """Base class for all Hugging Face models.
+
+    repo_id: Hugging Face repo ID.
+    params_path: Path to the model's config.json. If the corresponding .json does not yet exist, please create one.
+    convert_weights: Used to convert Hugging Face weight parameters to the static decoder's parameter naming.
+    transform_weight: Set to True to transform Hugging Face weights to improve the performance of RoPE in the HTP backend.
+    instruct_model: True if the model uses chat templates. Check the Hugging Face model card to ensure the model uses chat templates.
+    """
+
     repo_id: str
     params_path: str
-    runner_version: str
     convert_weights: Callable
+    transform_weight: bool
+    instruct_model: bool
 
 
 SUPPORTED_HF_MODELS: Dict[str, HFModel] = {}
@@ -45,40 +52,52 @@ def decorator(cls: Type[HFModel]):
     return decorator
 
 
-@register_hf_model("qwen2_5")
+@register_hf_model("qwen2_5-0_5b")
 @dataclass(init=False, frozen=True)
-class Qwen2_5(HFModel):
+class Qwen2_5_0_5B(HFModel):
     repo_id: str = "Qwen/Qwen2.5-0.5B"
     params_path: str = os.path.join(
         BASE_DIR, "../../../models/qwen2_5/config/0_5b_config.json"
     )
-    runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
     convert_weights = convert_qwen2_5_weights
     transform_weight = False
+    instruct_model = False
+
+
+@register_hf_model("qwen2_5-1_5b")
+@dataclass(init=False, frozen=True)
+class Qwen2_5_1_5B(HFModel):
+    repo_id: str = "Qwen/Qwen2.5-1.5B"
+    params_path: str = os.path.join(
+        BASE_DIR, "../../../models/qwen2_5/config/1_5b_config.json"
+    )
+    convert_weights = convert_qwen2_5_weights
+    transform_weight = False
+    instruct_model = False
 
 
-@register_hf_model("qwen3_0_6b")
+@register_hf_model("qwen3-0_6b")
 @dataclass(init=False, frozen=True)
 class Qwen3_0_6B(HFModel):
     repo_id: str = "Qwen/Qwen3-0.6B"
     params_path: str = os.path.join(
         BASE_DIR, "../../../models/qwen3/config/0_6b_config.json"
     )
-    runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
     convert_weights = convert_qwen3_weights
     transform_weight = False
+    instruct_model = True
 
 
-@register_hf_model("qwen3_1_7b")
+@register_hf_model("qwen3-1_7b")
 @dataclass(init=False, frozen=True)
 class Qwen3_1_7B(HFModel):
     repo_id: str = "Qwen/Qwen3-1.7B"
     params_path: str = os.path.join(
         BASE_DIR, "../../../models/qwen3/config/1_7b_config.json"
     )
-    runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
     convert_weights = convert_qwen3_weights
     transform_weight = False
+    instruct_model = True
 
 
 @register_hf_model("phi_4_mini")
@@ -88,9 +107,9 @@ class Phi4Mini(HFModel):
     params_path: str = os.path.join(
         BASE_DIR, "../../../models/phi_4_mini/config/config.json"
     )
-    runner_version: str = field(default=DECODER_MODEL_VERSION["phi_4_mini"])
     convert_weights = convert_phi_4_mini_weights
     transform_weight = False
+    instruct_model = True
 
 
 @register_hf_model("smollm2_135m")
@@ -100,6 +119,6 @@ class Smollm2_135M(HFModel):
     params_path: str = os.path.join(
         BASE_DIR, "../../../models/smollm2/135M_config.json"
    )
-    runner_version: str = field(default=DECODER_MODEL_VERSION["smollm2_135m"])
     convert_weights = convert_smollm2_weights
     transform_weight = True
+    instruct_model = True
```
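With this registry, adding another decoder is one more decorated dataclass in the same `__init__.py`. A hypothetical sketch following the field layout above; the `qwen2_5-3b` key, class name, and config path are placeholders for illustration, not entries added by this commit:

```python
# Hypothetical registration following the pattern above; the key, class, and
# config path are illustrative placeholders. Assumes it lives in the same module,
# so register_hf_model, HFModel, BASE_DIR, and convert_qwen2_5_weights are in scope.
@register_hf_model("qwen2_5-3b")
@dataclass(init=False, frozen=True)
class Qwen2_5_3B(HFModel):
    repo_id: str = "Qwen/Qwen2.5-3B"
    params_path: str = os.path.join(
        BASE_DIR, "../../../models/qwen2_5/config/3b_config.json"  # config must exist first
    )
    convert_weights = convert_qwen2_5_weights
    transform_weight = False
    instruct_model = False  # base (non-Instruct) checkpoint, so no chat template
```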

examples/qualcomm/oss_scripts/llama/decoder_constants.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -10,13 +10,15 @@
     "lookahead": 2,
 }
 
+# The dict's value is mainly for runner to decide what special tokens are required to wrap the prompt.
 DECODER_MODEL_VERSION = {
     "stories260k": "llama2",
     "stories110m": "llama2",
     "llama3_2": "llama3",
-    "qwen2_5": "qwen2_5",
-    "qwen3_0_6b": "qwen2_5",  # TODO: temp workaround, use special token for qwen3 in runner
-    "qwen3_1_7b": "qwen2_5",
+    "qwen2_5-0_5b": "qwen2_5",
+    "qwen2_5-1_5b": "qwen2_5",
+    "qwen3-0_6b": "qwen3",
+    "qwen3-1_7b": "qwen3",
     "phi_4_mini": "phi_4_mini",
     "smollm2_135m": "smollm2_135m",
 }
```
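The version string is what the on-device runner (not part of this diff) consults to pick the special tokens that wrap the prompt. Below is a rough Python illustration only, under the assumption that the qwen families use ChatML-style `<|im_start|>`/`<|im_end|>` markers; the `wrap_prompt` helper does not exist in the codebase:

```python
# Purely illustrative: the real token handling lives in the C++ runner, not here.
DECODER_MODEL_VERSION = {  # excerpt of the mapping above
    "qwen2_5-0_5b": "qwen2_5",
    "qwen3-0_6b": "qwen3",
}


def wrap_prompt(decoder_model: str, prompt: str) -> str:
    version = DECODER_MODEL_VERSION[decoder_model]
    if version in ("qwen2_5", "qwen3"):
        # ChatML-style wrapping used by the Qwen families (assumed here).
        return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    return prompt  # other families would use their own special tokens


print(wrap_prompt("qwen3-0_6b", "How many r's in strawberries?"))
```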

examples/qualcomm/oss_scripts/llama/decoder_utils.py

Lines changed: 34 additions & 7 deletions
```diff
@@ -458,22 +458,34 @@ def prefill_inference(
 
 
 def graph_module_inference(
-    args,
-    use_kv_cache,
+    use_kv_cache: bool,
     get_example_inputs: Callable,
     module: torch.fx.GraphModule,
     tokenizer,
     ar_len=1,
     max_seq_len=512,
     kv_updater=smart_mask_updater,
+    prompt=None,
+    tasks=None,
+    tasks_limit=1,
+    num_fewshot=None,
     use_i64_token=False,
     event_name: Optional[str] = None,
 ):
-    if args.tasks is None:
+    """
+    This function supports model execution from static nn.Module decoder model
+    all the way to edge program.
+    Users could choose to provide either the prompt or tasks for execution but not both.
+    """
+    # Checks 1 and only 1 is provided.
+    assert (tasks is None) != (
+        prompt is None
+    ), "Please provide either tasks or prompt - not both or neither"
+    if tasks is None:
         if use_kv_cache:
             kv_inference(
                 get_example_inputs,
-                args.prompt[0],
+                prompt,
                 module,
                 tokenizer,
                 ar_len,
@@ -485,7 +497,7 @@
         else:
             prefill_inference(
                 get_example_inputs,
-                args.prompt[0],
+                prompt,
                 module,
                 tokenizer,
                 max_seq_len,
@@ -507,9 +519,24 @@
         with torch.no_grad():
             eval_results = simple_evaluate(
                 model=calibration_wrapper,
-                tasks=args.tasks,
-                limit=args.limit,
+                tasks=tasks,
+                num_fewshot=num_fewshot,
+                limit=tasks_limit,
             )
         logging.info(f"Perplexity evaluation summary for {event_name}")
         for task, res in eval_results["results"].items():
             logging.info(f"{task}: {res}")
+
+
+def apply_prompt_template(
+    chat_template: Callable, prompt: str, system_prompt: str = None
+):
+    messages = [{"role": "user", "content": prompt}]
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+
+    template_prompt = chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    logging.info(f"Prompt after applying template: {template_prompt}")
+    return template_prompt
```
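A natural way to drive the new helper is to pass a Hugging Face tokenizer's `apply_chat_template` method as the `chat_template` callable; the snippet below is a usage sketch under that assumption (the model choice and prompt are arbitrary examples):

```python
# Usage sketch for apply_prompt_template; assumes a Hugging Face tokenizer whose
# apply_chat_template accepts (messages, tokenize=False, add_generation_prompt=True).
from transformers import AutoTokenizer

from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import (
    apply_prompt_template,
)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
wrapped = apply_prompt_template(
    chat_template=tokenizer.apply_chat_template,
    prompt="How many r's in strawberries?",
)
print(wrapped)  # prompt wrapped in the model's special tokens, ready for calibration
```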
