3 changes: 3 additions & 0 deletions backends/qualcomm/quantizer/custom_annotation.py
@@ -138,6 +138,9 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None
weight = node.args[1]
input_qspec_map[weight] = quantization_config.weight

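# If the conv carries a bias operand (args[2]), annotate it with the node-specific
# bias quantization spec so it does not stay unannotated.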
if len(node.args) > 2 and isinstance(node.args[2], Node):
input_qspec_map[node.args[2]] = quantization_config.bias(node)

node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
input_qspec_map=input_qspec_map,
output_qspec=quantization_config.output_activation,
61 changes: 61 additions & 0 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -5692,6 +5692,67 @@ def test_qnn_backend_seq_mse(self):


class TestExampleLLMScript(TestQNN):
def test_codegen2_1b(self):
if not self.required_envs():
self.skipTest("missing required envs")

prompt = "def hello_world():"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--model",
self.model,
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
prompt,
"--temperature",
"0",
"--decoder_model",
"codegen2_1b",
"--model_mode",
"kv",
"--max_seq_len",
"128",
]
if self.compile_only:
cmds.extend(["--compile_only"])
elif self.device:
cmds.extend(["--device", self.device])
if self.host:
cmds.extend(["--host", self.host])
elif self.enable_x86_64:
cmds.extend(["--enable_x86_64"])
if self.pre_gen_pte:
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

golden_start_with = "def hello_world():"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(
model_out.startswith(golden_start_with),
f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
)
if not self.enable_x86_64:
pte_size = msg["pte_size"]
self.assertLessEqual(pte_size, 1_200_000_000) # 1200MB
if not self.compile_only and not self.enable_x86_64:
self.assertGreaterEqual(msg["inference_speed"], 60)

def test_static_gemma_2b(self):
if not self.required_envs():
self.skipTest("missing required envs")
16 changes: 16 additions & 0 deletions examples/models/codegen/__init__.py
@@ -0,0 +1,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.examples.models.codegen.convert_weight import convert_weights
from executorch.examples.models.llama.model import Llama2Model


class CodeGenModel(Llama2Model):
def __init__(self, **kwargs):
super().__init__(**kwargs)


__all__ = [
"CodegenModel",
"convert_weights",
]
19 changes: 19 additions & 0 deletions examples/models/codegen/config/config.json
@@ -0,0 +1,19 @@
{
"dim": 2048,
"ffn_dim_multiplier": 1,
"hidden_dim": 8192,
"n_heads": 16,
"n_kv_heads": 16,
"n_layers": 16,
"vocab_size": 51200,
"norm_eps": 1e-05,
"max_seq_len": 2048,
"bos_idx": 1,
"eos_idx": 2,
"model_architecture": "CodeGenModel",
"use_hf_rope": true,
"partial_rotary_factor": 0.5,
"use_ffn_norm" : false,
"norm_type": "layernorm",
"output_bias": true
}
93 changes: 93 additions & 0 deletions examples/models/codegen/convert_weight.py
@@ -0,0 +1,93 @@
import argparse
import os
from typing import Dict

import torch

from torchtune.models.convert_weights import get_mapped_key

# Weight mapping from Meta-format names to HuggingFace CodeGen names (analogous to torchtune's standard _FROM_META mapping), plus additional bias weight mappings.
_HF__CODEGEN_2_FROM_META = {
"tok_embeddings.weight": "transformer.wte.weight",
"layers.{}.attention_norm.weight": "transformer.h.{}.ln_1.weight",
"layers.{}.attention_norm.bias": "transformer.h.{}.ln_1.bias",
"layers.{}.attention.wq.weight": "transformer.h.{}.attn.q_proj.weight",
"layers.{}.attention.wk.weight": "transformer.h.{}.attn.k_proj.weight",
"layers.{}.attention.wv.weight": "transformer.h.{}.attn.v_proj.weight",
"layers.{}.attention.wo.weight": "transformer.h.{}.attn.out_proj.weight",
"layers.{}.feed_forward.fc_in.weight": "transformer.h.{}.mlp.fc_in.weight",
"layers.{}.feed_forward.fc_in.bias": "transformer.h.{}.mlp.fc_in.bias",
"layers.{}.feed_forward.fc_out.weight": "transformer.h.{}.mlp.fc_out.weight",
"layers.{}.feed_forward.fc_out.bias": "transformer.h.{}.mlp.fc_out.bias",
"norm.weight": "transformer.ln_f.weight",
"norm.bias": "transformer.ln_f.bias",
"output.weight": "lm_head.weight",
"output.bias": "lm_head.bias",
}
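# Keys templated with "{}" are layer-indexed; torchtune's get_mapped_key fills in the
# layer number, e.g. "layers.0.attention.wq.weight" <-> "transformer.h.0.attn.q_proj.weight".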


def codegen_hf_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
converted_state_dict = {}
keys_to_remove = []
for key in state_dict:
if ".attn.causal_mask" in key:
keys_to_remove.append(key)
for key in keys_to_remove:
state_dict.pop(key)
inverted_mapping_dict = {v: k for k, v in _HF__CODEGEN_2_FROM_META.items()}
for key, value in state_dict.items():
if key.endswith("attn.qkv_proj.weight"):
mp_num = 8 # This number is from modeling_codegen.py
dim, dim_kv = value.shape
block = dim // mp_num
split_size = block // 3

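# CodeGen fuses Q, K and V into a single qkv_proj laid out as mp_num contiguous
# blocks; within each block the sub-order is (q, v, k), hence the slices below.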
qkv_blocks = value.reshape(mp_num, block, dim_kv)
q_blocks = qkv_blocks[:, 0:split_size, :]
v_blocks = qkv_blocks[:, split_size : 2 * split_size, :]
k_blocks = qkv_blocks[:, 2 * split_size : 3 * split_size, :]

q = q_blocks.reshape(-1, dim_kv)
v = v_blocks.reshape(-1, dim_kv)
k = k_blocks.reshape(-1, dim_kv)

for new_key, new_value in [("q_proj", q), ("k_proj", k), ("v_proj", v)]:
new_key = key.replace("qkv_proj", new_key)
new_key = get_mapped_key(new_key, inverted_mapping_dict)
converted_state_dict[new_key] = new_value
else:
mapped_key = get_mapped_key(key, inverted_mapping_dict)
converted_state_dict[mapped_key] = value

return converted_state_dict


def convert_weights(input_dir_or_checkpoint: str, output_file: str) -> None:
    # Accept either a single checkpoint file or a directory containing pytorch_model.bin.
    if os.path.isfile(input_dir_or_checkpoint):
        pt_path = input_dir_or_checkpoint
    else:
        pt_path = os.path.join(input_dir_or_checkpoint, "pytorch_model.bin")
    print("Loading checkpoint from file...")
    sd = torch.load(pt_path, map_location="cpu")
print("Converting checkpoint...")
sd = codegen_hf_to_meta(sd)

print("Saving checkpoint...")
torch.save(sd, output_file)
print("Done.")


def main():
parser = argparse.ArgumentParser(
description="Convert Codegen weights to Meta format."
)
parser.add_argument(
"input_dir",
type=str,
help="Path to directory containing checkpoint files, or path to a single checkpoint file.",
)
parser.add_argument("output", type=str, help="Path to the output checkpoint")

args = parser.parse_args()
convert_weights(args.input_dir, args.output)


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions examples/models/llama/model_args.py
@@ -46,12 +46,17 @@ class ModelArgs:
head_dim: Optional[int] = None # Optional customized head_dim
multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
ffn_dim_multiplier: Optional[float] = None
model_architecture: str = (
"LlamaForCausalLM" # This setting is currently only supported for the QNN backend
)
norm_eps: float = 1e-5
post_attention_norm: bool = False
post_ffn_norm: bool = False
max_batch_size: int = 1
max_seq_len: int = 2048
max_context_len: int = 2048
use_ffn_norm: bool = True
output_bias: bool = False
moe: bool = False # True to enable the MoE (Mixture of Experts)
num_experts: int = 8 # Number of experts
num_activated_experts: int = 2 # Number of experts to activate
22 changes: 14 additions & 8 deletions examples/qualcomm/oss_scripts/llama/README.md
@@ -5,13 +5,14 @@ This file provides you the instructions to run LLM Decoder model with different
1. LLAMA2 Stories 110M
2. LLAMA3.2 1B
3. LLAMA3.2 3B
-4. Gemma 2B
-5. Gemma3 1B
-6. Phi4-mini-instruct
-7. QWEN2.5 0.5B / 1.5B
-8. QWEN3 0.6B / 1.7B
-9. SmolLM2 135M
-10. SmolLM3 3B
+4. Codegen2 1B
+5. Gemma 2B
+6. Gemma3 1B
+7. Phi4-mini-instruct
+8. QWEN2.5 0.5B / 1.5B
+9. QWEN3 0.6B / 1.7B
+10. SmolLM2 135M
+11. SmolLM3 3B


We offer the following modes to execute the model:
@@ -80,6 +81,12 @@ Default example using kv mode.
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```

#### Codegen2
Default example using kv mode.
```bash
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model codegen2_1b --model_mode kv --max_seq_len 1024 --prompt "def hello_world():"
```

#### Gemma 2B
Default example using hybrid mode
```bash
@@ -135,7 +142,6 @@ Default example using kv mode.
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
```


### KV Cache update mechanism
We have two distinct mechanisms for updating the key-value (KV) cache, which can be selected at runtime: Shift Pointer and Smart Mask.

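The sketch below is an illustration only (not code from this repository; buffer shapes and helper names are invented): Smart Mask keeps the cache buffer in place and unmasks the slot it just wrote, while Shift Pointer conceptually moves the write position forward each step instead of copying data.

```python
import numpy as np

# Toy single-head cache: MAX_SEQ slots of HEAD_DIM features (shapes invented for illustration).
MAX_SEQ, HEAD_DIM = 8, 4


def smart_mask_update(cache, attn_mask, new_kv, pos):
    # Smart Mask: cache memory never moves; write the new entry into slot `pos`
    # and flip that position in the attention mask from masked to visible.
    cache[pos] = new_kv
    attn_mask[pos] = 0.0  # 0 = attend, -inf = masked
    return cache, attn_mask


def shift_pointer_update(cache, new_kv):
    # Shift Pointer: the runner advances the cache's base pointer each step so the
    # newest entry lands at the moving write position; emulated here with a roll.
    cache = np.roll(cache, shift=-1, axis=0)
    cache[-1] = new_kv
    return cache


cache = np.zeros((MAX_SEQ, HEAD_DIM), dtype=np.float32)
attn_mask = np.full(MAX_SEQ, -np.inf, dtype=np.float32)
cache, attn_mask = smart_mask_update(cache, attn_mask, np.ones(HEAD_DIM), pos=0)
cache = shift_pointer_update(cache, np.full(HEAD_DIM, 2.0))
```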
25 changes: 25 additions & 0 deletions examples/qualcomm/oss_scripts/llama/__init__.py
@@ -23,6 +23,9 @@
get_ptq_per_channel_quant_config,
)
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
from executorch.examples.models.codegen import (
convert_weights as convert_codegen_weights,
)

from executorch.examples.models.gemma import convert_weights as convert_gemma_weights
from executorch.examples.models.gemma3 import convert_weights as convert_gemma3_weights
@@ -331,6 +334,28 @@ class Gemma_2B(LLMModelConfig):
)


@register_llm_model("codegen2_1b")
@dataclass(init=False, frozen=True)
class Codegen(LLMModelConfig):
repo_id: str = "Salesforce/codegen2-1B_P"
params_path: str = os.path.join(
BASE_DIR, "../../../models/codegen/config/config.json"
)
convert_weights = convert_codegen_weights
transform_weight = True
instruct_model = False
num_sharding = 1
# quant config
ptq = QuantDtype.use_16a8w
group_size = None
masked_softmax = True
seq_mse_candidates = 0
r1 = False
r2 = False
r3 = False
custom_annotation = ()


@register_llm_model("gemma3-1b")
@dataclass(init=False, frozen=True)
class Gemma3(LLMModelConfig):
1 change: 1 addition & 0 deletions examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -25,4 +25,5 @@
"qwen3-1_7b": "qwen3",
"smollm2_135m": "smollm2_135m",
"smollm3-3b": "smollm3",
"codegen2_1b": "codegen",
}
28 changes: 19 additions & 9 deletions examples/qualcomm/oss_scripts/llama/llama.py
@@ -445,7 +445,6 @@ def compile(
kv_config.use_kv_cache = True
kv_config.enable_r3 = decoder_model_config.r3
kv_config.kv_io_bit_width = decoder_model_config.get_kv_io_bit_width()

if decoder_model_config.masked_softmax:
if is_qnn_sdk_version_less_than("2.35"):
logging.warning(
@@ -561,25 +560,30 @@ def compile(

if decoder_model_config.transform_weight:
# Change to HuggingFace weight to improve the performance of RoPE in HTP backend.
-def permute(w, heads):
+def permute(w, heads, partial_rotary_dim):
    dim_0 = w.size(0)
    dim_1 = w.size(1)
-    return (
-        w.view(heads, dim_0 // heads // 2, 2, dim_1)
-        .transpose(1, 2)
+    transformed_weight = (
+        w.view(heads, -1, dim_0 // heads // 2 // partial_rotary_dim, 2, dim_1)
+        .transpose(2, 3)
        .reshape(dim_0, dim_1)
    )
+    return transformed_weight

n_heads = llama_instance_list[0].n_heads
n_kv_heads = llama_instance_list[0].n_kv_heads
n_layers = llama_instance_list[0].n_layers

# TODO: Handle cases where the input size isn't evenly divisible.
partial_rotary_dim = int(1 // kv_config.partial_rotary_factor)
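# E.g. partial_rotary_factor = 0.5 (codegen2 config) gives partial_rotary_dim = 2,
# i.e. only the first half of each head's dimensions receive rotary embeddings.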
for layer_i in range(n_layers):
state_dict[f"layers.{layer_i}.attention.wq.weight"] = permute(
state_dict[f"layers.{layer_i}.attention.wq.weight"], n_heads
state_dict[f"layers.{layer_i}.attention.wq.weight"],
n_heads,
partial_rotary_dim,
)
state_dict[f"layers.{layer_i}.attention.wk.weight"] = permute(
state_dict[f"layers.{layer_i}.attention.wk.weight"], n_kv_heads
state_dict[f"layers.{layer_i}.attention.wk.weight"],
n_kv_heads,
partial_rotary_dim,
)

for llama_instance in llama_instance_list:
@@ -648,6 +652,7 @@ def permute(w, heads):
for layer in llama_instance.layers:
if getattr(layer.attention, "prepare_sha", None):
layer.attention.prepare_sha()

if getattr(layer.feed_forward, "prepare_feedfoward_conv", None):
layer.feed_forward.prepare_feedfoward_conv()

@@ -1299,8 +1304,13 @@ def export_llama(args) -> None:
runtime_tokenizer_path = tokenizer_artifacts[-1]
tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config)

if args.decoder_model == "codegen2_1b":
# Override the default BOS and EOS token IDs for codegen2_1b
tokenizer.bos_id = 1
tokenizer.eos_id = 2

# TODO: Remove this once error is resolved.
if args.decoder_model == "phi_4_mini":
elif args.decoder_model == "phi_4_mini":
with open(runtime_tokenizer_path, "r+") as file:
data = json.load(file)
# TODO: Encountered the following error during runtime, so switched behavior for now.