Skip to content

Commit dde5562

Browse files
committed
Resolve rebase conflict
1 parent 3798202 commit dde5562

File tree

4 files changed

+18
-18
lines changed

4 files changed

+18
-18
lines changed

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4615,11 +4615,14 @@ def test_static_qwen3(self):
46154615
"--decoder_model",
46164616
"qwen3-0_6b",
46174617
"--model_mode",
4618-
"hybrid",
4619-
"--prefill_ar_len",
4620-
"32",
4618+
"kv",
46214619
"--max_seq_len",
4622-
"128",
4620+
"1024",
4621+
"--eval_perplexity",
4622+
"--tasks",
4623+
"wikitext",
4624+
"--limit",
4625+
"1",
46234626
"--r3",
46244627
"--enable_masked_softmax",
46254628
]
@@ -4634,8 +4637,6 @@ def test_static_qwen3(self):
46344637
if self.pre_gen_pte:
46354638
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
46364639

4637-
# TODO: Change to PPL evaluation
4638-
golden_start_with = "<|im_start|>user"
46394640
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
46404641
with Listener((self.ip, self.port)) as listener:
46414642
conn = listener.accept()
@@ -4644,12 +4645,13 @@ def test_static_qwen3(self):
46444645
if "Error" in msg:
46454646
self.fail(msg["Error"])
46464647
else:
4647-
model_out = msg["result"][0]
4648-
self.assertTrue(
4649-
model_out.startswith(golden_start_with),
4650-
f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
4651-
)
4652-
self.assertGreaterEqual(msg["inference_speed"], 70) # Lanai
4648+
inference_speed_ref = {"SM8650": 38, "SM8750": 56}
4649+
self.assertLessEqual(msg["wiki_ppl"], 18)
4650+
self.assertLessEqual(msg["pte_size"], 950_000_000) # 950mb
4651+
if self.model in inference_speed_ref:
4652+
self.assertGreaterEqual(
4653+
msg["inference_speed"], inference_speed_ref[self.model]
4654+
)
46534655

46544656
def test_smollm2(self):
46554657
if not self.required_envs():

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL
9696
#### SMOLLM2
9797
Default example using hybrid mode.
9898
```bash
99-
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -H mlgtw-linux -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a8w --tokenizer_bin tokenizer.bin --decoder_model smollm2 --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?"
99+
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -H mlgtw-linux -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a8w --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?"
100100
```
101101

102102
### KV Cache update mechanism

examples/qualcomm/oss_scripts/llama/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,23 +19,21 @@
1919
from executorch.examples.models.smollm2 import (
2020
convert_weights as convert_smollm2_weights,
2121
)
22-
from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
23-
DECODER_MODEL_VERSION,
24-
)
2522

2623
BASE_DIR = os.path.dirname(__file__)
2724

2825

2926
@dataclass(init=False, frozen=True)
3027
class HFModel(ABC):
31-
""" Base class for all hugging face models
28+
"""Base class for all hugging face models
3229
3330
repo_id: Hugging Face Repo ID.
3431
params_path: Path to model's config.json. If the corresponding .json does not yet exist, please create one.
3532
convert_weights: Used to convert Hugging Face weights parameters to Static Decoder's parameter naming.
3633
transform_weight: Set to true to change HuggingFace weight to improve the performance of RoPE in HTP backend.
3734
instruct_model: True if the model uses chat templates; check the Hugging Face model card to confirm.
3835
"""
36+
3937
repo_id: str
4038
params_path: str
4139
convert_weights: Callable

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1248,7 +1248,7 @@ def export_llama(args) -> None:
12481248
chat_template = (
12491249
tokenizer.apply_chat_template
12501250
if hasattr(tokenizer, "apply_chat_template")
1251-
and SUPPORTED_HF_MODELS[args.decoder_model].transform_weight
1251+
and SUPPORTED_HF_MODELS[args.decoder_model].instruct_model
12521252
else None
12531253
)
12541254
runtime_tokenizer_path = tokenizer.save_pretrained(args.artifact)[-1]

0 commit comments

Comments
 (0)