3 changes: 3 additions & 0 deletions backends/qualcomm/quantizer/custom_annotation.py
@@ -138,6 +138,9 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None
weight = node.args[1]
input_qspec_map[weight] = quantization_config.weight

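# If the conv carries a bias operand (args[2]), annotate it with the node-specific
# bias quantization spec so it does not stay unannotated.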
if len(node.args) > 2 and isinstance(node.args[2], Node):
input_qspec_map[node.args[2]] = quantization_config.bias(node)

node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
input_qspec_map=input_qspec_map,
output_qspec=quantization_config.output_activation,
61 changes: 61 additions & 0 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -5692,6 +5692,67 @@ def test_qnn_backend_seq_mse(self):


class TestExampleLLMScript(TestQNN):
def test_codegen2_1b(self):
if not self.required_envs():
self.skipTest("missing required envs")

prompt = "def hello_world():"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--model",
self.model,
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
prompt,
"--temperature",
"0",
"--decoder_model",
"codegen2_1b",
"--model_mode",
"kv",
"--max_seq_len",
"128",
]
if self.compile_only:
cmds.extend(["--compile_only"])
elif self.device:
cmds.extend(["--device", self.device])
if self.host:
cmds.extend(["--host", self.host])
elif self.enable_x86_64:
cmds.extend(["--enable_x86_64"])
if self.pre_gen_pte:
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

golden_start_with = "def hello_world():"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(
model_out.startswith(golden_start_with),
f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
)
if not self.enable_x86_64:
pte_size = msg["pte_size"]
self.assertLessEqual(pte_size, 1_200_000_000) # 1200MB
if not self.compile_only and not self.enable_x86_64:
self.assertGreaterEqual(msg["inference_speed"], 60)

def test_static_gemma_2b(self):
if not self.required_envs():
self.skipTest("missing required envs")
16 changes: 16 additions & 0 deletions examples/models/codegen/__init__.py
@@ -0,0 +1,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.examples.models.codegen.convert_weight import convert_weights
from executorch.examples.models.llama.model import Llama2Model


class CodeGenModel(Llama2Model):
def __init__(self, **kwargs):
super().__init__(**kwargs)


__all__ = [
"CodegenModel",
"convert_weights",
]
19 changes: 19 additions & 0 deletions examples/models/codegen/config/config.json
@@ -0,0 +1,19 @@
{
"dim": 2048,
"ffn_dim_multiplier": 1,
"hidden_dim": 8192,
"n_heads": 16,
"n_kv_heads": 16,
"n_layers": 16,
"vocab_size": 51200,
"norm_eps": 1e-05,
"max_seq_len": 2048,
"bos_idx": 1,
"eos_idx": 2,
"model_architecture": "CodeGenModel",
"use_hf_rope": true,
"partial_rotary_factor": 0.5,
"use_ffn_norm" : false,
"norm_type": "layernorm",
"output_bias": true
}
93 changes: 93 additions & 0 deletions examples/models/codegen/convert_weight.py
@@ -0,0 +1,93 @@
import argparse
import os
from typing import Dict

import torch

from torchtune.models.convert_weights import get_mapped_key

# Weight mapping from Meta-format names to HuggingFace CodeGen names (analogous to torchtune's standard _FROM_META mapping), plus additional bias weight mappings.
_HF__CODEGEN_2_FROM_META = {
"tok_embeddings.weight": "transformer.wte.weight",
"layers.{}.attention_norm.weight": "transformer.h.{}.ln_1.weight",
"layers.{}.attention_norm.bias": "transformer.h.{}.ln_1.bias",
"layers.{}.attention.wq.weight": "transformer.h.{}.attn.q_proj.weight",
"layers.{}.attention.wk.weight": "transformer.h.{}.attn.k_proj.weight",
"layers.{}.attention.wv.weight": "transformer.h.{}.attn.v_proj.weight",
"layers.{}.attention.wo.weight": "transformer.h.{}.attn.out_proj.weight",
"layers.{}.feed_forward.fc_in.weight": "transformer.h.{}.mlp.fc_in.weight",
"layers.{}.feed_forward.fc_in.bias": "transformer.h.{}.mlp.fc_in.bias",
"layers.{}.feed_forward.fc_out.weight": "transformer.h.{}.mlp.fc_out.weight",
"layers.{}.feed_forward.fc_out.bias": "transformer.h.{}.mlp.fc_out.bias",
"norm.weight": "transformer.ln_f.weight",
"norm.bias": "transformer.ln_f.bias",
"output.weight": "lm_head.weight",
"output.bias": "lm_head.bias",
}
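# Keys templated with "{}" are layer-indexed; torchtune's get_mapped_key fills in the
# layer number, e.g. "layers.0.attention.wq.weight" <-> "transformer.h.0.attn.q_proj.weight".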


def codegen_hf_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
converted_state_dict = {}
keys_to_remove = []
for key in state_dict:
if ".attn.causal_mask" in key:
keys_to_remove.append(key)
for key in keys_to_remove:
state_dict.pop(key)
inverted_mapping_dict = {v: k for k, v in _HF__CODEGEN_2_FROM_META.items()}
for key, value in state_dict.items():
if key.endswith("attn.qkv_proj.weight"):
mp_num = 8 # This number is from modeling_codegen.py
dim, dim_kv = value.shape
block = dim // mp_num
split_size = block // 3

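# CodeGen fuses Q, K and V into a single qkv_proj laid out as mp_num contiguous
# blocks; within each block the sub-order is (q, v, k), hence the slices below.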
qkv_blocks = value.reshape(mp_num, block, dim_kv)
q_blocks = qkv_blocks[:, 0:split_size, :]
v_blocks = qkv_blocks[:, split_size : 2 * split_size, :]
k_blocks = qkv_blocks[:, 2 * split_size : 3 * split_size, :]

q = q_blocks.reshape(-1, dim_kv)
v = v_blocks.reshape(-1, dim_kv)
k = k_blocks.reshape(-1, dim_kv)

for new_key, new_value in [("q_proj", q), ("k_proj", k), ("v_proj", v)]:
new_key = key.replace("qkv_proj", new_key)
new_key = get_mapped_key(new_key, inverted_mapping_dict)
converted_state_dict[new_key] = new_value
else:
mapped_key = get_mapped_key(key, inverted_mapping_dict)
converted_state_dict[mapped_key] = value

return converted_state_dict


def convert_weights(input_dir_or_checkpoint: str, output_file: str) -> None:
    # Accept either a single checkpoint file or a directory containing pytorch_model.bin.
    if os.path.isfile(input_dir_or_checkpoint):
        pt_path = input_dir_or_checkpoint
    else:
        pt_path = os.path.join(input_dir_or_checkpoint, "pytorch_model.bin")
    print("Loading checkpoint from file...")
    sd = torch.load(pt_path, map_location="cpu")
print("Converting checkpoint...")
sd = codegen_hf_to_meta(sd)

print("Saving checkpoint...")
torch.save(sd, output_file)
print("Done.")


def main():
parser = argparse.ArgumentParser(
description="Convert Codegen weights to Meta format."
)
parser.add_argument(
"input_dir",
type=str,
help="Path to directory containing checkpoint files, or path to a single checkpoint file.",
)
parser.add_argument("output", type=str, help="Path to the output checkpoint")

args = parser.parse_args()
convert_weights(args.input_dir, args.output)


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions examples/models/llama/model_args.py
@@ -46,12 +46,17 @@ class ModelArgs:
head_dim: Optional[int] = None # Optional customized head_dim
multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
ffn_dim_multiplier: Optional[float] = None
model_architecture: str = (
"LlamaForCausalLM" # This setting is currently only supported for the QNN backend
)
norm_eps: float = 1e-5
post_attention_norm: bool = False
post_ffn_norm: bool = False
max_batch_size: int = 1
max_seq_len: int = 2048
max_context_len: int = 2048
use_ffn_norm: bool = True
output_bias: bool = False
moe: bool = False # True to enable the MoE (Mixture of Experts)
num_experts: int = 8 # Number of experts
num_activated_experts: int = 2 # Number of experts to activate
22 changes: 14 additions & 8 deletions examples/qualcomm/oss_scripts/llama/README.md
@@ -5,13 +5,14 @@ This file provides you the instructions to run LLM Decoder model with different
1. LLAMA2 Stories 110M
2. LLAMA3.2 1B
3. LLAMA3.2 3B
-4. Gemma 2B
-5. Gemma3 1B
-6. Phi4-mini-instruct
-7. QWEN2.5 0.5B / 1.5B
-8. QWEN3 0.6B / 1.7B
-9. SmolLM2 135M
-10. SmolLM3 3B
+4. Codegen2 1B
+5. Gemma 2B
+6. Gemma3 1B
+7. Phi4-mini-instruct
+8. QWEN2.5 0.5B / 1.5B
+9. QWEN3 0.6B / 1.7B
+10. SmolLM2 135M
+11. SmolLM3 3B


We offer the following modes to execute the model:
@@ -80,6 +81,12 @@ Default example using kv mode.
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```

#### Codegen2
Default example using kv mode.
```bash
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():"
```

#### Gemma 2B
Default example using hybrid mode
```bash
@@ -135,7 +142,6 @@ Default example using kv mode.
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```


### KV Cache update mechanism
We have two distinct mechanisms for updating the key-value (KV) cache, which can be selected at runtime: Shift Pointer and Smart Mask.

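The sketch below is an illustration only (not code from this repository; buffer shapes and helper names are invented): Smart Mask keeps the cache buffer in place and unmasks the slot it just wrote, while Shift Pointer conceptually moves the write position forward each step instead of copying data.

```python
import numpy as np

# Toy single-head cache: MAX_SEQ slots of HEAD_DIM features (shapes invented for illustration).
MAX_SEQ, HEAD_DIM = 8, 4


def smart_mask_update(cache, attn_mask, new_kv, pos):
    # Smart Mask: cache memory never moves; write the new entry into slot `pos`
    # and flip that position in the attention mask from masked to visible.
    cache[pos] = new_kv
    attn_mask[pos] = 0.0  # 0 = attend, -inf = masked
    return cache, attn_mask


def shift_pointer_update(cache, new_kv):
    # Shift Pointer: the runner advances the cache's base pointer each step so the
    # newest entry lands at the moving write position; emulated here with a roll.
    cache = np.roll(cache, shift=-1, axis=0)
    cache[-1] = new_kv
    return cache


cache = np.zeros((MAX_SEQ, HEAD_DIM), dtype=np.float32)
attn_mask = np.full(MAX_SEQ, -np.inf, dtype=np.float32)
cache, attn_mask = smart_mask_update(cache, attn_mask, np.ones(HEAD_DIM), pos=0)
cache = shift_pointer_update(cache, np.full(HEAD_DIM, 2.0))
```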
25 changes: 25 additions & 0 deletions examples/qualcomm/oss_scripts/llama/__init__.py
@@ -23,6 +23,9 @@
get_ptq_per_channel_quant_config,
)
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
from executorch.examples.models.codegen import (
convert_weights as convert_codegen_weights,
)

from executorch.examples.models.gemma import convert_weights as convert_gemma_weights
from executorch.examples.models.gemma3 import convert_weights as convert_gemma3_weights
@@ -331,6 +334,28 @@ class Gemma_2B(LLMModelConfig):
)


@register_llm_model("codegen2_1b")
@dataclass(init=False, frozen=True)
class Codegen(LLMModelConfig):
repo_id: str = "Salesforce/codegen2-1B_P"
params_path: str = os.path.join(
BASE_DIR, "../../../models/codegen/config/config.json"
)
convert_weights = convert_codegen_weights
transform_weight = True
instruct_model = False
num_sharding = 1
# quant config
ptq = QuantDtype.use_16a8w
group_size = None
masked_softmax = True
seq_mse_candidates = 0
r1 = False
r2 = False
r3 = False
custom_annotation = ()


@register_llm_model("gemma3-1b")
@dataclass(init=False, frozen=True)
class Gemma3(LLMModelConfig):
1 change: 1 addition & 0 deletions examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -25,4 +25,5 @@
"qwen3-1_7b": "qwen3",
"smollm2_135m": "smollm2_135m",
"smollm3-3b": "smollm3",
"codegen2_1b": "codegen",
}
28 changes: 19 additions & 9 deletions examples/qualcomm/oss_scripts/llama/llama.py
@@ -445,7 +445,6 @@ def compile(
kv_config.use_kv_cache = True
kv_config.enable_r3 = decoder_model_config.r3
kv_config.kv_io_bit_width = decoder_model_config.get_kv_io_bit_width()

if decoder_model_config.masked_softmax:
if is_qnn_sdk_version_less_than("2.35"):
logging.warning(
@@ -561,25 +560,30 @@ def compile(

if decoder_model_config.transform_weight:
# Change to HuggingFace weight to improve the performance of RoPE in HTP backend.
-def permute(w, heads):
+def permute(w, heads, partial_rotary_dim):
    dim_0 = w.size(0)
    dim_1 = w.size(1)
-    return (
-        w.view(heads, dim_0 // heads // 2, 2, dim_1)
-        .transpose(1, 2)
+    transformed_weight = (
+        w.view(heads, -1, dim_0 // heads // 2 // partial_rotary_dim, 2, dim_1)
+        .transpose(2, 3)
        .reshape(dim_0, dim_1)
    )
+    return transformed_weight

n_heads = llama_instance_list[0].n_heads
n_kv_heads = llama_instance_list[0].n_kv_heads
n_layers = llama_instance_list[0].n_layers

# TODO: Handle cases where the input size isn't evenly divisible.
partial_rotary_dim = int(1 // kv_config.partial_rotary_factor)
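# E.g. partial_rotary_factor = 0.5 (codegen2 config) gives partial_rotary_dim = 2,
# i.e. only the first half of each head's dimensions receive rotary embeddings.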
for layer_i in range(n_layers):
state_dict[f"layers.{layer_i}.attention.wq.weight"] = permute(
state_dict[f"layers.{layer_i}.attention.wq.weight"], n_heads
state_dict[f"layers.{layer_i}.attention.wq.weight"],
n_heads,
partial_rotary_dim,
)
state_dict[f"layers.{layer_i}.attention.wk.weight"] = permute(
state_dict[f"layers.{layer_i}.attention.wk.weight"], n_kv_heads
state_dict[f"layers.{layer_i}.attention.wk.weight"],
n_kv_heads,
partial_rotary_dim,
)

for llama_instance in llama_instance_list:
@@ -648,6 +652,7 @@ def permute(w, heads):
for layer in llama_instance.layers:
if getattr(layer.attention, "prepare_sha", None):
layer.attention.prepare_sha()

if getattr(layer.feed_forward, "prepare_feedfoward_conv", None):
layer.feed_forward.prepare_feedfoward_conv()

@@ -1299,8 +1304,13 @@ def export_llama(args) -> None:
runtime_tokenizer_path = tokenizer_artifacts[-1]
tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config)

if args.decoder_model == "codegen2_1b":
# Override the default BOS and EOS token IDs for codegen2_1b
tokenizer.bos_id = 1
tokenizer.eos_id = 2

# TODO: Remove this once error is resolved.
if args.decoder_model == "phi_4_mini":
elif args.decoder_model == "phi_4_mini":
with open(runtime_tokenizer_path, "r+") as file:
data = json.load(file)
# TODO: Encountered the following error during runtime, so switched behavior for now.