2 changes: 2 additions & 0 deletions examples/qualcomm/oss_scripts/llama/README.md
@@ -6,6 +6,8 @@ This file provides you the instructions to run LLM Decoder model with different
2. LLAMA3.2 1B
3. LLAMA3.2 3B
4. QWEN2.5 0.5B
5. QWEN3 0.6B / 1.7B
6. Phi4-mini-instruct

We offer the following modes to execute the model:

14 changes: 14 additions & 0 deletions examples/qualcomm/oss_scripts/llama/__init__.py
@@ -9,6 +9,9 @@
from dataclasses import dataclass, field
from typing import Callable, Dict, Type

from executorch.examples.models.phi_4_mini import (
convert_weights as convert_phi_4_mini_weights,
)
from executorch.examples.models.qwen2_5 import (
convert_weights as convert_qwen2_5_weights,
)
@@ -71,3 +74,14 @@ class Qwen3_1_7B(HFModel):
)
runner_version: str = field(default=DECODER_MODEL_VERSION["qwen2_5"])
convert_weights = convert_qwen3_weights


@register_hf_model("phi_4_mini")
@dataclass(init=False, frozen=True)
class Phi4Mini(HFModel):
repo_id: str = "microsoft/Phi-4-mini-instruct"
params_path: str = os.path.join(
BASE_DIR, "../../../models/phi_4_mini/config/config.json"
)
runner_version: str = field(default=DECODER_MODEL_VERSION["phi_4_mini"])
convert_weights = convert_phi_4_mini_weights
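
For context, the `@register_hf_model("phi_4_mini")` decorator above is what makes the new entry reachable through the shared `SUPPORTED_HF_MODELS` table that `llama.py` consults. A minimal usage sketch follows; the import path and the `convert_weights(input_dir, output_file)` argument names are assumptions for illustration, not taken from the diff:

```python
# Hypothetical lookup of the newly registered model; paths and signatures are illustrative.
from executorch.examples.qualcomm.oss_scripts.llama import SUPPORTED_HF_MODELS

phi4 = SUPPORTED_HF_MODELS["phi_4_mini"]
print(phi4.repo_id)         # "microsoft/Phi-4-mini-instruct"
print(phi4.runner_version)  # resolved from DECODER_MODEL_VERSION["phi_4_mini"]

# Convert a downloaded Hugging Face checkpoint into the layout the example expects
# (argument names are an assumption).
phi4.convert_weights("/path/to/hf_checkpoint", "/path/to/phi_4_mini_converted.pth")
```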
1 change: 1 addition & 0 deletions examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -15,4 +15,5 @@
"stories110m": "llama2",
"llama3_2": "llama3",
"qwen2_5": "qwen2_5",
"phi_4_mini": "phi_4_mini",
}
11 changes: 8 additions & 3 deletions examples/qualcomm/oss_scripts/llama/llama.py
@@ -579,7 +579,7 @@ def permute(w, heads):
annotate_conv=args.ptq != "16a8w",
),
)
if args.decoder_model == {"stories110m", "stories260k"}:
if args.decoder_model in {"stories110m", "stories260k"}:
custom_annotations = custom_annotations + (
annotate_linear_16a8w_in_affine_layer,
)
@@ -1175,11 +1175,16 @@ def export_llama(args) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_id)
runtime_tokenizer_path = tokenizer.save_pretrained(args.artifact)[-1]
tokenizer = get_tokenizer(runtime_tokenizer_path)
elif args.decoder_model == "phi_4_mini":
model_id = SUPPORTED_HF_MODELS[args.decoder_model].repo_id
tokenizer = AutoTokenizer.from_pretrained(model_id)
runtime_tokenizer_path = tokenizer.save_pretrained(args.artifact)[-1]
tokenizer = get_tokenizer(runtime_tokenizer_path)
with open(runtime_tokenizer_path, "r+") as file:
data = json.load(file)
# TODO: Encountered the following error during runtime, so switched behavior for now.
# Error: libc++abi: terminating due to uncaught exception of type std::runtime_error: Unsupported Normalizer type: NFC.
data.pop("normalizer")
# Error: libc++abi: terminating due to uncaught exception of type std::runtime_error: invert=true is not supported for Split PreTokenizer. Only invert=false is supported.
data["pre_tokenizer"]["pretokenizers"][-2]["invert"] = False
file.seek(0)
json.dump(data, file, indent=4)
file.truncate()
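
The block above patches the saved `tokenizer.json` in place so the runtime tokenizer can load it. A quick sanity check (a minimal sketch, not part of the diff; the artifact path is illustrative) can confirm both workarounds took effect:

```python
import json

# Path written by tokenizer.save_pretrained(args.artifact) above; adjust as needed.
with open("artifact/tokenizer.json") as f:
    data = json.load(f)

# The NFC normalizer entry was removed entirely.
assert "normalizer" not in data
# The Split pre-tokenizer (second-to-last entry, as patched above) no longer inverts.
assert data["pre_tokenizer"]["pretokenizers"][-2]["invert"] is False
```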
38 changes: 31 additions & 7 deletions examples/qualcomm/oss_scripts/llama/model/static_llama.py
@@ -39,6 +39,24 @@ def apply_rotary_emb_single(
return x_out


def apply_partial_rotary_emb_single(
x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
) -> torch.Tensor:

if x.dim() == 4:
freqs_cos = freqs_cos[None, :, None, :]
freqs_sin = freqs_sin[None, :, None, :]

rotary_dim = freqs_cos.shape[-1] * 2

x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
x_r, x_i = x_rot[..., : x_rot.shape[-1] // 2], x_rot[..., x_rot.shape[-1] // 2 :]
x_out_r = x_r * freqs_cos - x_i * freqs_sin
x_out_i = x_r * freqs_sin + x_i * freqs_cos
x_rotated = torch.cat([x_out_r, x_out_i], dim=-1)
return torch.cat([x_rotated, x_pass], dim=-1)


class LlamaAttention(nn.Module):
def __init__(self, config: ModelArgs, output_new_cache_only=False):
super().__init__()
@@ -60,6 +78,11 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False):
self.q_norm_fn = torch.nn.RMSNorm(q_norm_dim, eps=config.norm_eps)
self.k_norm_fn = torch.nn.RMSNorm(k_norm_dim, eps=config.norm_eps)

if config.partial_rotary_factor < 1:
self.apply_rope_emb = apply_partial_rotary_emb_single
else:
self.apply_rope_emb = apply_rotary_emb_single
Comment on lines +81 to +84 (Contributor Author):

Hi @cccclai

Can we refer to the config in this folder #13086 instead of having phi specific logic inside static llama?

I've updated the condition here; static llama only depends on config file now.


self.wq = nn.Linear(
self.dim,
self.n_heads * self.head_dim,
@@ -199,17 +222,17 @@ def forward_sha( # noqa: C901
for i in range(len(q)):
if self.use_qk_norm and self.qk_norm_before_rope:
q[i] = self.q_norm_fn(q[i])
q[i] = apply_rotary_emb_single(q[i], freqs_cos, freqs_sin)
q[i] = self.apply_rope_emb(q[i], freqs_cos, freqs_sin)
if hasattr(self.config, "enable_r3") and self.config.enable_r3:
q[i] = torch.matmul(q[i], self.r3_weight.T)
q[i] = torch.matmul(q[i], self.r3_weight)
if self.use_qk_norm and not self.qk_norm_before_rope:
q[i] = self.q_norm_fn(q[i])
for i in range(len(k)):
if self.use_qk_norm and self.qk_norm_before_rope:
k[i] = self.k_norm_fn(k[i])
k[i] = apply_rotary_emb_single(k[i], freqs_cos, freqs_sin).transpose(1, 2)
k[i] = self.apply_rope_emb(k[i], freqs_cos, freqs_sin).transpose(1, 2)
if hasattr(self.config, "enable_r3") and self.config.enable_r3:
k[i] = torch.matmul(k[i], self.r3_weight.T)
k[i] = torch.matmul(k[i], self.r3_weight)
if self.use_qk_norm and not self.qk_norm_before_rope:
k[i] = self.k_norm_fn(k[i])

@@ -272,8 +295,8 @@ def forward(
q = self.q_norm_fn(q)
k = self.k_norm_fn(k)

q = apply_rotary_emb_single(q, freqs_cos, freqs_sin)
k = apply_rotary_emb_single(k, freqs_cos, freqs_sin).permute(0, 2, 3, 1)
q = self.apply_rope_emb(q, freqs_cos, freqs_sin)
k = self.apply_rope_emb(k, freqs_cos, freqs_sin).permute(0, 2, 3, 1)

if self.use_qk_norm and not self.qk_norm_before_rope:
q = self.q_norm_fn(q)
@@ -368,7 +391,8 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False):
super().__init__()
self.dim = config.dim
self.attention = LlamaAttention(
config=config, output_new_cache_only=output_new_cache_only
config=config,
output_new_cache_only=output_new_cache_only,
)
self.feed_forward = FeedForward(config)
self.attention_norm = torch.nn.RMSNorm(config.dim, eps=config.norm_eps)
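
The `apply_partial_rotary_emb_single` helper added above rotates only the first `rotary_dim = 2 * freqs_cos.shape[-1]` channels of each head and passes the remaining channels through unchanged, which is what configs with `partial_rotary_factor < 1` (e.g. Phi-4-mini) select via `self.apply_rope_emb`. A minimal shape-level sketch, with tensor sizes and the import path chosen purely for illustration:

```python
import torch

# Assumed import path, derived from the file location in this diff.
from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import (
    apply_partial_rotary_emb_single,
)

# Illustrative sizes: head_dim = 8 with partial_rotary_factor = 0.5 -> rotary_dim = 4.
seq_len, head_dim, rotary_dim = 3, 8, 4

x = torch.randn(1, seq_len, 1, head_dim)           # (batch, seq, n_heads, head_dim)
freqs_cos = torch.randn(seq_len, rotary_dim // 2)  # cos for the rotated half
freqs_sin = torch.randn(seq_len, rotary_dim // 2)  # sin for the rotated half

out = apply_partial_rotary_emb_single(x, freqs_cos, freqs_sin)

# The trailing head_dim - rotary_dim channels are untouched by the rotation.
assert out.shape == x.shape
assert torch.equal(out[..., rotary_dim:], x[..., rotary_dim:])
```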
14 changes: 12 additions & 2 deletions examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -9,8 +9,8 @@
/**
* @file
*
* This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B with Qualcomm
* AI Engine Direct.
This tool can run Llama2 110M, Llama3.2 1B / 3B, Qwen2.5 0.5B, Qwen3 0.6B
* / 1.7B, and Phi4-mini-instruct with Qualcomm AI Engine Direct.
*
*/

@@ -104,6 +104,16 @@ std::string get_formatted_prompt(
case example::DecoderModelVersion::kQwen2_5:
formatted_prompt.append(prompt);
break;
case example::DecoderModelVersion::kPhi4:
if (!system_prompt.empty()) {
formatted_prompt.append("<|system|>");
formatted_prompt.append(system_prompt);
formatted_prompt.append("<|end|>");
}
formatted_prompt.append("<|user|>");
formatted_prompt.append(prompt);
formatted_prompt.append("<|end|><|assistant|>");
break;
case example::DecoderModelVersion::kLlama3:
if (!system_prompt.empty()) {
formatted_prompt.append(
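
For reference, the new `kPhi4` branch above builds an instruct-style prompt from `<|system|>`, `<|user|>`, `<|end|>`, and `<|assistant|>` markers. The small Python sketch below just mirrors that C++ logic to show the rendered string; it is illustrative and not part of the runner:

```python
def format_phi4_prompt(prompt: str, system_prompt: str = "") -> str:
    # Mirrors the kPhi4 case of get_formatted_prompt() above.
    formatted = ""
    if system_prompt:
        formatted += f"<|system|>{system_prompt}<|end|>"
    formatted += f"<|user|>{prompt}<|end|><|assistant|>"
    return formatted

print(format_phi4_prompt("What is 1+1?", "You are a helpful assistant."))
# <|system|>You are a helpful assistant.<|end|><|user|>What is 1+1?<|end|><|assistant|>
```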
4 changes: 4 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -130,6 +130,8 @@ Runner::Runner(
decoder_model_version_ = DecoderModelVersion::kLlama3;
} else if (decoder_model_version == "qwen2_5") {
decoder_model_version_ = DecoderModelVersion::kQwen2_5;
} else if (decoder_model_version == "phi_4_mini") {
decoder_model_version_ = DecoderModelVersion::kPhi4;
} else {
ET_CHECK_MSG(false, "Unsupported Decoder Model");
}
@@ -185,6 +187,8 @@ Error Runner::load() {
}
if (decoder_model_version_ == DecoderModelVersion::kLlama3) {
eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]);
} else if (decoder_model_version_ == DecoderModelVersion::kPhi4) {
eos_ids->insert(tokenizer_->encode("<|end|>", 0, 0).get()[0]);
}
// Try avoid getMetadataHelper as it is time consuming.
Result<MethodMeta> method_meta =
1 change: 1 addition & 0 deletions examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -31,6 +31,7 @@ enum DecoderModelVersion {
kLlama2 = 0,
kLlama3,
kQwen2_5,
kPhi4,
};
class Runner {
public: