Qualcomm AI Engine Direct - GA Static Smollm2 (pytorch#13406)

chenweng-quic · web-flow · commit c3108748bed2 · 2025-08-19T16:59:18.000-07:00
### Summary
Enable the static SmolLM2 135M decoder in the Qualcomm AI Engine Direct llama example.

<img width="1607" height="1117" alt="image" src="https://github.com/user-attachments/assets/acefe148-cfca-42e7-9ea1-07e2df7bd72b" />

### Test plan
```
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -H <host> -s <device_id> -m SM8650 --ptq 16a8w --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "What is the capital of France."
python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_smollm2 --device <device_id> --host <host> --model <soc_model> --build_folder build-android --executorch_root . --artifact all_artifact
```

---------

Co-authored-by: Cheng-Hsin Weng <chenweng@qti.qualcomm.com>
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -4649,6 +4649,64 @@ def test_static_qwen3(self):
                 )
                 self.assertGreaterEqual(msg["inference_speed"], 70)  # Lanai
 
+    def test_smollm2(self):  # end-to-end run of the llama.py example with the smollm2_135m decoder
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "My favourite condiment is "
+        cmds = [  # argv for the example script, launched as a subprocess below
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",  # same ip/port pair the Listener below binds to
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a8w",
+            "--decoder_model",
+            "smollm2_135m",
+            "--model_mode",
+            "kv",
+            "--temperature",
+            "0",
+            "--prefill_ar_len",
+            "128",
+            "--max_seq_len",
+            "1024",
+            "--eval_perplexity",  # result is checked through msg["wiki_ppl"] below
+            "--task",
+            "wikitext",
+        ]
+        if self.compile_only:  # compile-only runs never pass --device
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+        if self.host:
+            cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:  # only considered when no host is set
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)  # script stdout is discarded
+        with Listener((self.ip, self.port)) as listener:  # presumably llama.py connects back here to report results -- confirm
+            conn = listener.accept()
+            p.communicate()  # wait for the script to exit before reading its message
+            msg = json.loads(conn.recv())  # the received payload is parsed as JSON
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                self.assertLessEqual(msg["wiki_ppl"], 25)  # upper bound on the reported wikitext perplexity
+                self.assertGreaterEqual(msg["inference_speed"], 200)  # unit not visible here; presumably tokens/sec -- confirm
+
 
 class TestExampleOssScript(TestQNN):
     def test_albert(self):
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
@@ -8,6 +8,7 @@ This file provides you the instructions to run LLM Decoder model with different
  4. QWEN2.5 0.5B
  5. QWEN3 0.6B / 1.7B
  6. Phi4-mini-instruct
+ 7. SMOLLM2 135M
 
 We offer the following modes to execute the model:
 
@@ -74,6 +75,12 @@ Default example using hybrid mode
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --ptq 16a8w --enable_masked_softmax --r3 --decoder_model qwen2_5 --prompt "I would like to learn python, could you teach me with a simple example?"
 ```
 
+#### SMOLLM2
+Default example using hybrid mode.
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a8w --decoder_model smollm2_135m --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?"
+```
+
 ### KV Cache update mechanism
 We have two distinct mechanisms for updating the key-value (KV) cache, which can be selected at runtime. Shift Pointer and Smart Mask.
 
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -16,7 +16,9 @@
     convert_weights as convert_qwen2_5_weights,
 )
 from executorch.examples.models.qwen3 import convert_weights as convert_qwen3_weights
-
+from executorch.examples.models.smollm2 import (
+    convert_weights as convert_smollm2_weights,
+)
 from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
     DECODER_MODEL_VERSION,
 )
@@ -52,6 +54,7 @@ class Qwen2_5(HFModel):
     )
     runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
     convert_weights = convert_qwen2_5_weights
+    transform_weight = False
 
 
 @register_hf_model("qwen3_0_6b")
@@ -63,6 +66,7 @@ class Qwen3_0_6B(HFModel):
     )
     runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
     convert_weights = convert_qwen3_weights
+    transform_weight = False
 
 
 @register_hf_model("qwen3_1_7b")
@@ -74,6 +78,7 @@ class Qwen3_1_7B(HFModel):
     )
     runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
     convert_weights = convert_qwen3_weights
+    transform_weight = False
 
 
 @register_hf_model("phi_4_mini")
@@ -85,3 +90,16 @@ class Phi4Mini(HFModel):
     )
     runner_version: str = field(default=DECODER_MODEL_VERSION["phi_4_mini"])
     convert_weights = convert_phi_4_mini_weights
+    transform_weight = False
+
+
+@register_hf_model("smollm2_135m")  # presumably the key under which llama.py finds this entry in SUPPORTED_HF_MODELS -- confirm
+@dataclass(init=False, frozen=True)
+class Smollm2_135M(HFModel):  # HF model entry for the smollm2_135m decoder
+    repo_id: str = "HuggingFaceTB/SmolLM2-135M-Instruct"  # export_llama passes this to AutoTokenizer.from_pretrained
+    params_path: str = os.path.join(
+        BASE_DIR, "../../../models/smollm2/135M_config.json"
+    )
+    runner_version: str = field(default=DECODER_MODEL_VERSION["smollm2_135m"])  # resolves to "smollm2_135m"
+    convert_weights = convert_smollm2_weights
+    transform_weight = True  # compile() reads this flag to decide whether to apply the RoPE weight permutation
diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -18,4 +18,5 @@
     "qwen3_0_6b": "qwen2_5",  # TODO: temp workaround, use special token for qwen3 in runner
     "qwen3_1_7b": "qwen2_5",
     "phi_4_mini": "phi_4_mini",
+    "smollm2_135m": "smollm2_135m",
 }
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -434,6 +434,7 @@ def compile(args, pte_filename, tokenizer):
         state_dict = torch.load(
             checkpoint, weights_only=True, map_location="cpu", mmap=True
         )
+        transform_weight = SUPPORTED_HF_MODELS[args.decoder_model].transform_weight
     else:
         state_dict = torch.load(
             args.checkpoint, weights_only=True, map_location="cpu", mmap=True
@@ -444,7 +445,9 @@ def compile(args, pte_filename, tokenizer):
 
         if args.decoder_model == "stories260k":
             state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+        transform_weight = True
 
+    if transform_weight:
         # Change to HuggingFace weight to improve the performance of RoPE in HTP backend.
         def permute(w, heads):
             dim_0 = w.size(0)
@@ -1172,11 +1175,6 @@ def export_llama(args) -> None:
             tokenizer, TiktokenTokenizer
         ), f"Wrong tokenizer provided for llama3_2."
         runtime_tokenizer_path = args.tokenizer_model
-    elif args.decoder_model in {"qwen2_5", "qwen3_0_6b", "qwen3_1_7b"}:
-        model_id = SUPPORTED_HF_MODELS[args.decoder_model].repo_id
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        runtime_tokenizer_path = tokenizer.save_pretrained(args.artifact)[-1]
-        tokenizer = get_tokenizer(runtime_tokenizer_path)
     elif args.decoder_model == "phi_4_mini":
         model_id = SUPPORTED_HF_MODELS[args.decoder_model].repo_id
         tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -1190,6 +1188,11 @@ def export_llama(args) -> None:
             file.seek(0)
             json.dump(data, file, indent=4)
             file.truncate()
+    elif args.decoder_model in SUPPORTED_HF_MODELS:
+        model_id = SUPPORTED_HF_MODELS[args.decoder_model].repo_id
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        runtime_tokenizer_path = tokenizer.save_pretrained(args.artifact)[-1]
+        tokenizer = get_tokenizer(runtime_tokenizer_path)
     else:
         raise RuntimeError(f"Unknown decoder_model: {args.decoder_model}.")
 
diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -10,7 +10,7 @@
  * @file
  *
  * This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B, Qwen3 0.6B
- * / 1.7B, phi4-mini-instruct with Qualcomm AI Engine Direct.
+ * / 1.7B, phi4-mini-instruct, Smollm2 135M with Qualcomm AI Engine Direct.
  *
  */
 
@@ -113,6 +113,17 @@ std::string get_formatted_prompt(
       formatted_prompt.append("<|user|>");
       formatted_prompt.append(prompt);
       formatted_prompt.append("<|end|><|assistant|>");
+      break;  // keep the preceding case from falling through into the SmolLM2 template
+    case example::DecoderModelVersion::kSmollm2_135m:
+      // SmolLM2 prompt layout: optional system turn, then the user turn.
+      if (!system_prompt.empty()) {
+        formatted_prompt.append("<|im_start|>system\n");
+        formatted_prompt.append(system_prompt);
+        formatted_prompt.append("<|im_end|>\n\n");
+      }
+      formatted_prompt.append("<|im_start|>user\n");
+      formatted_prompt.append(prompt);
+      formatted_prompt.append("<|im_end|>\n\n");
       break;
     case example::DecoderModelVersion::kLlama3:
       if (!system_prompt.empty()) {
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -132,6 +132,8 @@ Runner<T>::Runner(
     decoder_model_version_ = DecoderModelVersion::kQwen2_5;
   } else if (decoder_model_version == "phi_4_mini") {
     decoder_model_version_ = DecoderModelVersion::kPhi4;
+  } else if (decoder_model_version == "smollm2_135m") {
+    decoder_model_version_ = DecoderModelVersion::kSmollm2_135m;
   } else {
     ET_CHECK_MSG(false, "Unsupported Decoder Model");
   }
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -32,6 +32,7 @@ enum DecoderModelVersion {
   kLlama3,
   kQwen2_5,
   kPhi4,
+  kSmollm2_135m
 };
 
 enum KvBitWidth {

Original file line number	Diff line number	Diff line change
`@@ -18,4 +18,5 @@`
`18`	`18`	`"qwen3_0_6b": "qwen2_5", # TODO: temp workaround, use special token for qwen3 in runner`
`19`	`19`	`"qwen3_1_7b": "qwen2_5",`
`20`	`20`	`"phi_4_mini": "phi_4_mini",`
	`21`	`+ "smollm2_135m": "smollm2_135m",`
`21`	`22`	`}`
Original file line number	Diff line number	Diff line change
`@@ -132,6 +132,8 @@ Runner<T>::Runner(`
`132`	`132`	`decoder_model_version_ = DecoderModelVersion::kQwen2_5;`
`133`	`133`	`} else if (decoder_model_version == "phi_4_mini") {`
`134`	`134`	`decoder_model_version_ = DecoderModelVersion::kPhi4;`
	`135`	`+ } else if (decoder_model_version == "smollm2_135m") {`
	`136`	`+ decoder_model_version_ = DecoderModelVersion::kSmollm2_135m;`
`135`	`137`	`} else {`
`136`	`138`	`ET_CHECK_MSG(false, "Unsupported Decoder Model");`
`137`	`139`	`}`