From fc07cc20b4cd835b9a70bb083c48b4b06e82b999 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:58:17 -0800 Subject: [PATCH 01/12] Add qwen 2.5 --- examples/models/llama/attention.py | 7 ++- examples/models/llama/model.py | 3 +- examples/models/llama/rope.py | 53 ++++++++++++++-- examples/models/qwen2_5/convert_weights.py | 73 ++++++++++++++++++++++ 4 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 examples/models/qwen2_5/convert_weights.py diff --git a/examples/models/llama/attention.py b/examples/models/llama/attention.py index 91168a388d3..3d127e47f3d 100644 --- a/examples/models/llama/attention.py +++ b/examples/models/llama/attention.py @@ -175,9 +175,10 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): self.max_batch_size = args.max_batch_size self.max_context_len = args.max_context_len self.dim = args.dim - self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False) - self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) + # TODO: parametrize bias for attention and feedforward. + self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=True) + self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True) + self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True) self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False) self.layer_id = layer_id diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index 90582af4856..ac9d30c7e1b 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -150,6 +150,7 @@ def __init__(self, **kwargs): input_prune_map=input_prune_map, output_prune_map=output_prune_map, enable_dynamic_shape=self.enable_dynamic_shape, + use_hf_rope=True, **params, ) @@ -170,7 +171,7 @@ def __init__(self, **kwargs): # Within the device="meta" context, tensors that are created do not carry data. # They possess all other metadata a tensor carries such as size, stride, requires_grad. - with torch.device("meta"): + with torch.device("cpu"): self.model_ = Transformer(model_args) if "int8" in str(checkpoint_path): diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index 01352f404df..caca5907a0c 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -114,6 +114,7 @@ def apply_rotary_emb_to_k( return xk_out.type_as(xk) +# Wrap apply_rotary_emb in a module to enable it to be module swapped out. class RotaryEmbedding(torch.nn.Module): def __init__(self): super().__init__() @@ -209,18 +210,66 @@ def hf_apply_rotary_emb_to_k(k, cos, sin, position_ids=None, unsqueeze_dim=1): return k_embed +# ======================= Qwen2 Implementation ======================== + + +def qwen_precompute_freqs_cis(dim: int, end: int, theta: float = 1_000_000.0): + """ + Precompute frequency tensor for Qwen2-style RoPE. + """ + freqs = 1.0 / ( + theta ** (torch.arange(0, dim, 2, device="cpu")[: (dim // 2)].float() / dim) + ) + t = torch.arange(end, device=freqs.device) + freqs = torch.outer(t, freqs).float() + freqs_cos = torch.cos(freqs) + freqs_sin = torch.sin(freqs) + return freqs_cos, freqs_sin + + +def qwen_apply_rotary_emb( + q: torch.Tensor, k: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Apply Qwen2-style RoPE to query and key tensors. 
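+
+    This follows the Hugging Face "rotate half" convention: each head dimension is
+    split into two contiguous halves rather than interleaved as complex pairs (the
+    layout used by the Meta-style apply_rotary_emb above). freqs_cos and freqs_sin
+    are expected to broadcast against the full head dimension of q and k.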
+ """ + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + # Reshape cos and sin for broadcasting + cos = freqs_cos.unsqueeze(1) # [seq_len, 1, head_dim] + sin = freqs_sin.unsqueeze(1) # [seq_len, 1, head_dim] + + # Apply rotation + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + class Rope(torch.nn.Module): def __init__(self, params: ModelArgs): super().__init__() self.params = params + + # Choose the appropriate RoPE implementation if self.params.use_hf_rope: self.precompute_freqs_cis = hf_precompute_freqs_cis + self.apply_rotary_emb = hf_apply_rotary_emb + # elif self.params.use_qwen_rope: + # self.precompute_freqs_cis = qwen_precompute_freqs_cis + # self.apply_rotary_emb = qwen_apply_rotary_emb else: self.precompute_freqs_cis = partial( precompute_freqs_cis, use_scaled=self.params.use_scaled_rope, scale_factor=self.params.rope_scale_factor, ) + self.apply_rotary_emb = RotaryEmbedding() + + # Precompute frequencies freqs_cos, freqs_sin = self.precompute_freqs_cis( self.params.head_dim, ( @@ -232,10 +281,6 @@ def __init__(self, params: ModelArgs): ) self.register_buffer("freqs_cos", freqs_cos, persistent=False) self.register_buffer("freqs_sin", freqs_sin, persistent=False) - if self.params.use_hf_rope: - self.apply_rotary_emb = hf_apply_rotary_emb - else: - self.apply_rotary_emb = RotaryEmbedding() def forward( self, diff --git a/examples/models/qwen2_5/convert_weights.py b/examples/models/qwen2_5/convert_weights.py new file mode 100644 index 00000000000..4341d260831 --- /dev/null +++ b/examples/models/qwen2_5/convert_weights.py @@ -0,0 +1,73 @@ +from typing import Dict + +from torchtune.training import FullModelHFCheckpointer +# from torchtune.models import convert_weights +from torchtune.models.convert_weights import get_mapped_key +import torch + +# Standard _FROM_META weight mapping of Meta weights to TorchTune + additional bias weight mappings. +_QWEN_2_FROM_META = { + "tok_embeddings.weight": "tok_embeddings.weight", + "norm.weight": "norm.scale", + "output.weight": "output.weight", + "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight", + "layers.{}.attention.wk.bias": "layers.{}.attn.k_proj.bias", + "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight", + "layers.{}.attention.wq.bias": "layers.{}.attn.q_proj.bias", + "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight", + "layers.{}.attention.wv.bias": "layers.{}.attn.v_proj.bias", + "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight", + "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale", + "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale", + "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight", + "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight", + "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight", + +} + +def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Convert a state dict from torchtune's format to Meta's format. This function + doesn't handle any sharding or splitting of state dicts. It follows the + state_dict IN -> state_dict OUT pattern. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format. + + Returns: + Dict[str, torch.Tensor]: State dict in Meta's format. 
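+
+    Example (illustrative only; shapes are placeholders):
+        >>> tune_sd = {"tok_embeddings.weight": torch.randn(8, 4),
+        ...            "layers.0.attn.q_proj.bias": torch.randn(4)}
+        >>> meta_sd = qwen_2_tune_to_meta(tune_sd)
+        >>> "layers.0.attention.wq.bias" in meta_sd
+        True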
+ """ + converted_state_dict = {} + inverted_mapping_dict = {v: k for k, v in _QWEN_2_FROM_META.items()} + + for key, value in state_dict.items(): + new_key = get_mapped_key(key, inverted_mapping_dict) + converted_state_dict[new_key] = value + + return converted_state_dict + +# TODO: no need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. +checkpointer = FullModelHFCheckpointer( + checkpoint_dir='/home/jackzhxng/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/', + checkpoint_files=['model.safetensors'], + output_dir='.' , + model_type='QWEN2' +) + +print("Loading checkpoint") +sd = checkpointer.load_checkpoint() + +print("HF weights:") +for weight in sd["model"].keys(): + print(weight) +print() + +# Convert from TorchTune to Meta (PyTorch native) +sd = qwen_2_tune_to_meta(sd['model']) + +print("Meta weights:") +for weight in sd.keys(): + print(weight) + +print("Saving checkpoint") +torch.save(sd, "/home/jackzhxng/models/qwen2_5-1_5b.pth") From 110abd0117f824f591004719ab084d640df51f99 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 12 Feb 2025 12:08:15 -0800 Subject: [PATCH 02/12] Fix output embedding --- examples/models/qwen2_5/convert_weights.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/models/qwen2_5/convert_weights.py b/examples/models/qwen2_5/convert_weights.py index 4341d260831..6478d4e8b2c 100644 --- a/examples/models/qwen2_5/convert_weights.py +++ b/examples/models/qwen2_5/convert_weights.py @@ -9,7 +9,6 @@ _QWEN_2_FROM_META = { "tok_embeddings.weight": "tok_embeddings.weight", "norm.weight": "norm.scale", - "output.weight": "output.weight", "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight", "layers.{}.attention.wk.bias": "layers.{}.attn.k_proj.bias", "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight", @@ -22,7 +21,6 @@ "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight", "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight", "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight", - } def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: @@ -44,6 +42,9 @@ def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch. new_key = get_mapped_key(key, inverted_mapping_dict) converted_state_dict[new_key] = value + # 0.5b and 1.5b models share the same weights for tok_embeddings and output embeddings, see https://github.com/QwenLM/Qwen2.5/issues/733. + converted_state_dict["output.weight"] = converted_state_dict["tok_embeddings.weight"] + return converted_state_dict # TODO: no need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. From 42fdb0da1f9c7f4af09705ed84a9cf9691303bd8 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 12 Feb 2025 12:10:33 -0800 Subject: [PATCH 03/12] Comment / lint --- examples/models/llama/rope.py | 1 + examples/models/qwen2_5/convert_weights.py | 36 ++++++++++------------ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index caca5907a0c..cfe74c172fa 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -233,6 +233,7 @@ def qwen_apply_rotary_emb( """ Apply Qwen2-style RoPE to query and key tensors. 
""" + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] diff --git a/examples/models/qwen2_5/convert_weights.py b/examples/models/qwen2_5/convert_weights.py index 6478d4e8b2c..0481799e974 100644 --- a/examples/models/qwen2_5/convert_weights.py +++ b/examples/models/qwen2_5/convert_weights.py @@ -1,10 +1,11 @@ from typing import Dict -from torchtune.training import FullModelHFCheckpointer -# from torchtune.models import convert_weights -from torchtune.models.convert_weights import get_mapped_key import torch +from torchtune.models.convert_weights import get_mapped_key + +from torchtune.training import FullModelHFCheckpointer + # Standard _FROM_META weight mapping of Meta weights to TorchTune + additional bias weight mappings. _QWEN_2_FROM_META = { "tok_embeddings.weight": "tok_embeddings.weight", @@ -23,6 +24,7 @@ "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight", } + def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Convert a state dict from torchtune's format to Meta's format. This function @@ -43,32 +45,26 @@ def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch. converted_state_dict[new_key] = value # 0.5b and 1.5b models share the same weights for tok_embeddings and output embeddings, see https://github.com/QwenLM/Qwen2.5/issues/733. - converted_state_dict["output.weight"] = converted_state_dict["tok_embeddings.weight"] + converted_state_dict["output.weight"] = converted_state_dict[ + "tok_embeddings.weight" + ] return converted_state_dict + # TODO: no need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. checkpointer = FullModelHFCheckpointer( - checkpoint_dir='/home/jackzhxng/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/', - checkpoint_files=['model.safetensors'], - output_dir='.' , - model_type='QWEN2' + checkpoint_dir="/home/jackzhxng/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/", + checkpoint_files=["model.safetensors"], + output_dir=".", + model_type="QWEN2", ) print("Loading checkpoint") sd = checkpointer.load_checkpoint() -print("HF weights:") -for weight in sd["model"].keys(): - print(weight) -print() - -# Convert from TorchTune to Meta (PyTorch native) -sd = qwen_2_tune_to_meta(sd['model']) - -print("Meta weights:") -for weight in sd.keys(): - print(weight) +# Convert from TorchTune to Meta (PyTorch native). 
+sd = qwen_2_tune_to_meta(sd["model"]) print("Saving checkpoint") -torch.save(sd, "/home/jackzhxng/models/qwen2_5-1_5b.pth") +torch.save(sd, "/home/jackzhxng/models/qwen2_5-1_5b.pth") From 3ab0bd994971f9994f61b969d1f69af12dff32ec Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 12 Feb 2025 16:41:11 -0800 Subject: [PATCH 04/12] Add 1.5 config --- examples/models/qwen2_5/1_5b_config.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 examples/models/qwen2_5/1_5b_config.json diff --git a/examples/models/qwen2_5/1_5b_config.json b/examples/models/qwen2_5/1_5b_config.json new file mode 100644 index 00000000000..6ef6f3cc27e --- /dev/null +++ b/examples/models/qwen2_5/1_5b_config.json @@ -0,0 +1,12 @@ +{ + "dim": 1536, + "ffn_dim_multiplier": 1, + "hidden_dim": 8960, + "n_heads": 12, + "n_kv_heads": 2, + "n_layers": 28, + "norm_eps": 1e-06, + "rope_theta": 1000000.0, + "use_scaled_rope": false, + "vocab_size": 151936 +} From 0a17e3b30fa8878a7612b88110f9f2547b804cda Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 12 Feb 2025 17:15:41 -0800 Subject: [PATCH 05/12] Comment --- examples/models/qwen2_5/convert_weights.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/qwen2_5/convert_weights.py b/examples/models/qwen2_5/convert_weights.py index 0481799e974..ce914539cf3 100644 --- a/examples/models/qwen2_5/convert_weights.py +++ b/examples/models/qwen2_5/convert_weights.py @@ -52,7 +52,7 @@ def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch. return converted_state_dict -# TODO: no need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. +# Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. checkpointer = FullModelHFCheckpointer( checkpoint_dir="/home/jackzhxng/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/", checkpoint_files=["model.safetensors"], From a27ed67e5ce681bbd2a7b4b376bd9fa227183eef Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 12 Feb 2025 17:45:09 -0800 Subject: [PATCH 06/12] Remove qwen rope, use hf rope instead --- examples/models/llama/rope.py | 43 ----------------------------------- 1 file changed, 43 deletions(-) diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index cfe74c172fa..e081c442032 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -210,46 +210,6 @@ def hf_apply_rotary_emb_to_k(k, cos, sin, position_ids=None, unsqueeze_dim=1): return k_embed -# ======================= Qwen2 Implementation ======================== - - -def qwen_precompute_freqs_cis(dim: int, end: int, theta: float = 1_000_000.0): - """ - Precompute frequency tensor for Qwen2-style RoPE. - """ - freqs = 1.0 / ( - theta ** (torch.arange(0, dim, 2, device="cpu")[: (dim // 2)].float() / dim) - ) - t = torch.arange(end, device=freqs.device) - freqs = torch.outer(t, freqs).float() - freqs_cos = torch.cos(freqs) - freqs_sin = torch.sin(freqs) - return freqs_cos, freqs_sin - - -def qwen_apply_rotary_emb( - q: torch.Tensor, k: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Apply Qwen2-style RoPE to query and key tensors. 
- """ - - def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - # Reshape cos and sin for broadcasting - cos = freqs_cos.unsqueeze(1) # [seq_len, 1, head_dim] - sin = freqs_sin.unsqueeze(1) # [seq_len, 1, head_dim] - - # Apply rotation - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - class Rope(torch.nn.Module): def __init__(self, params: ModelArgs): super().__init__() @@ -259,9 +219,6 @@ def __init__(self, params: ModelArgs): if self.params.use_hf_rope: self.precompute_freqs_cis = hf_precompute_freqs_cis self.apply_rotary_emb = hf_apply_rotary_emb - # elif self.params.use_qwen_rope: - # self.precompute_freqs_cis = qwen_precompute_freqs_cis - # self.apply_rotary_emb = qwen_apply_rotary_emb else: self.precompute_freqs_cis = partial( precompute_freqs_cis, From 8aadf4510450a1aa98ce05897b14f9b136c961a4 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 13 Feb 2025 11:02:49 -0800 Subject: [PATCH 07/12] Back to meta --- examples/models/llama/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index ac9d30c7e1b..f239952be79 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -171,7 +171,7 @@ def __init__(self, **kwargs): # Within the device="meta" context, tensors that are created do not carry data. # They possess all other metadata a tensor carries such as size, stride, requires_grad. - with torch.device("cpu"): + with torch.device("meta"): self.model_ = Transformer(model_args) if "int8" in str(checkpoint_path): From 8b0b9f9eef516fa7c7a5b8c19a591711fb7816df Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 13 Feb 2025 11:36:36 -0800 Subject: [PATCH 08/12] Parametrize qkv bias --- examples/models/llama/attention.py | 14 ++++++++++---- examples/models/llama/model_args.py | 1 + examples/models/qwen2_5/1_5b_config.json | 3 ++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/models/llama/attention.py b/examples/models/llama/attention.py index 3d127e47f3d..66eeb10989f 100644 --- a/examples/models/llama/attention.py +++ b/examples/models/llama/attention.py @@ -175,10 +175,16 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): self.max_batch_size = args.max_batch_size self.max_context_len = args.max_context_len self.dim = args.dim - # TODO: parametrize bias for attention and feedforward. 
- self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=True) - self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True) - self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True) + self.attention_qkv_bias = args.attention_qkv_bias + self.wq = nn.Linear( + self.dim, self.n_heads * self.head_dim, bias=self.attention_qkv_bias + ) + self.wk = nn.Linear( + self.dim, self.n_kv_heads * self.head_dim, bias=self.attention_qkv_bias + ) + self.wv = nn.Linear( + self.dim, self.n_kv_heads * self.head_dim, bias=self.attention_qkv_bias + ) self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False) self.layer_id = layer_id diff --git a/examples/models/llama/model_args.py b/examples/models/llama/model_args.py index e1c4edb8e93..28804839815 100644 --- a/examples/models/llama/model_args.py +++ b/examples/models/llama/model_args.py @@ -21,6 +21,7 @@ class ModelArgs: num_experts: int = 8 # Number of experts num_activated_experts: int = 2 # Number of experts to activate attention_type: str = "mha" # Attention type, registered in attention.py + attention_qkv_bias: bool = False use_kv_cache: bool = False # Use key/value cache use_sdpa_with_kv_cache_op: bool = ( False # Use custom sdpa op that updates kv cache in-place diff --git a/examples/models/qwen2_5/1_5b_config.json b/examples/models/qwen2_5/1_5b_config.json index 6ef6f3cc27e..95de1f62dfc 100644 --- a/examples/models/qwen2_5/1_5b_config.json +++ b/examples/models/qwen2_5/1_5b_config.json @@ -8,5 +8,6 @@ "norm_eps": 1e-06, "rope_theta": 1000000.0, "use_scaled_rope": false, - "vocab_size": 151936 + "vocab_size": 151936, + "attention_qkv_bias": true } From 52d7a1178f5181aa07b645d5a940214274652463 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 13 Feb 2025 12:35:05 -0800 Subject: [PATCH 09/12] Parametrize use hf rope --- examples/models/llama/model.py | 1 - examples/models/qwen2_5/1_5b_config.json | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index f239952be79..90582af4856 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -150,7 +150,6 @@ def __init__(self, **kwargs): input_prune_map=input_prune_map, output_prune_map=output_prune_map, enable_dynamic_shape=self.enable_dynamic_shape, - use_hf_rope=True, **params, ) diff --git a/examples/models/qwen2_5/1_5b_config.json b/examples/models/qwen2_5/1_5b_config.json index 95de1f62dfc..64daca5a7cd 100644 --- a/examples/models/qwen2_5/1_5b_config.json +++ b/examples/models/qwen2_5/1_5b_config.json @@ -9,5 +9,6 @@ "rope_theta": 1000000.0, "use_scaled_rope": false, "vocab_size": 151936, + "use_hf_rope": true, "attention_qkv_bias": true } From 347c6fbc6c79e92443fa71e9360abd481d5ec564 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 19 Feb 2025 14:44:19 -0800 Subject: [PATCH 10/12] Clean up convert_weights --- examples/models/qwen2_5/convert_weights.py | 46 ++++++++++++++++------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/models/qwen2_5/convert_weights.py b/examples/models/qwen2_5/convert_weights.py index ce914539cf3..6b6c0bbdfe2 100644 --- a/examples/models/qwen2_5/convert_weights.py +++ b/examples/models/qwen2_5/convert_weights.py @@ -1,3 +1,4 @@ +import argparse from typing import Dict import torch @@ -52,19 +53,38 @@ def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch. 
return converted_state_dict -# Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. -checkpointer = FullModelHFCheckpointer( - checkpoint_dir="/home/jackzhxng/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/", - checkpoint_files=["model.safetensors"], - output_dir=".", - model_type="QWEN2", -) +def main(): + parser = argparse.ArgumentParser( + description="Convert Qwen2 weights to Meta format." + ) + parser.add_argument( + "input_dir", + type=str, + help="Path to directory containing checkpoint files", + ) + parser.add_argument("output", type=str, help="Path to the output checkpoint") -print("Loading checkpoint") -sd = checkpointer.load_checkpoint() + args = parser.parse_args() -# Convert from TorchTune to Meta (PyTorch native). -sd = qwen_2_tune_to_meta(sd["model"]) + # Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. + checkpointer = FullModelHFCheckpointer( + # checkpoint_dir="/home/jackzhxng/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/", + checkpoint_dir=args.input_dir, + checkpoint_files=["model.safetensors"], + output_dir=".", + model_type="QWEN2", + ) -print("Saving checkpoint") -torch.save(sd, "/home/jackzhxng/models/qwen2_5-1_5b.pth") + print("Loading checkpoint...") + sd = checkpointer.load_checkpoint() + + print("Converting checkpoint...") + sd = qwen_2_tune_to_meta(sd["model"]) + # torch.save(sd, "/home/jackzhxng/models/qwen2_5-1_5b.pth") + + torch.save(sd, args.output) + print(f"Checkpoint saved to {args.output}") + + +if __name__ == "__main__": + main() From 44aa34d37b3d59ce05974840d73d6f39561de94e Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 19 Feb 2025 14:44:30 -0800 Subject: [PATCH 11/12] Add README.md --- examples/models/qwen2_5/README.md | 66 +++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 examples/models/qwen2_5/README.md diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md new file mode 100644 index 00000000000..aede40ec4dc --- /dev/null +++ b/examples/models/qwen2_5/README.md @@ -0,0 +1,66 @@ +## Summary +Qwen 2.5 is the latest iteration of the Qwen series of large language models (LLMs) developed by Alibaba. At the moment, 1.5b is currently supporting, with plans in the future for adding the 0.5b and 3b versions. + +## Instructions + +Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details. + +All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args: +``` +--model qwen2_5 +--params examples/models/qwen2_5/1_5b_config.json +--checkpoint +``` + +### Generate the Checkpoint +The original checkpoint can be obtained from HuggingFace: +``` +huggingface-cli download Qwen/Qwen2.5-1.5B +``` + +We then convert it to Meta's checkpoint format: +``` +python examples/models/qwen2_5/convert_weights.py +``` + +### Example export and run +Here is an basic example for exporting and running Qwen 2.5, although please refer to [Llama README page](../llama/README.md) for more advanced usage. 
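+The export step produces a `.pte` file, which the run step below then loads along with the tokenizer.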
+ +Export to XNNPack, no quantization: +``` +# No quantization +# Set these paths to point to the downloaded files +QWEN_CHECKPOINT=path/to/checkpoint.pth + +python -m examples.models.llama.export_llama \ + --model "qwen2_5" \ + --checkpoint "${QWEN_CHECKPOINT:?}" \ + --params examples/models/qwen2_5/1_5b_config.json \ + -kv \ + --use_sdpa_with_kv_cache \ + -d fp32 \ + -X \ + --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \ + --output_name="qwen2_5-1_5b.pte" + --verbose +``` + +Run using the executor runner: +``` +# Currently a work in progress, just need to enable HuggingFace json tokenizer in C++. +# In the meantime, can run with an example Python runner with pybindings: + +python -m examples.models.llama.runner.native + --model qwen2_5 + --pte + -kv + --tokenizer /tokenizer.json + --tokenizer_config /tokenizer_config.json + --prompt "Who is the founder of Meta?" + --params examples/models/qwen2_5/1_5b_config.json + --max_len 64 + --temperature 0 +``` + + + From 93064d24827193c1e80348ff06e04c9d51a7aa07 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 21 Feb 2025 08:08:36 -0800 Subject: [PATCH 12/12] Bias for static attention --- examples/models/llama/static_attention.py | 7 ++++--- examples/models/qwen2_5/README.md | 3 --- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/models/llama/static_attention.py b/examples/models/llama/static_attention.py index 8b341a3aafd..72ed4e1dfff 100644 --- a/examples/models/llama/static_attention.py +++ b/examples/models/llama/static_attention.py @@ -145,22 +145,23 @@ def __init__(self, config: ModelArgs, layer_id: int, rope: Rope): self.dim = config.dim self.head_dim = config.head_dim self.inv_scale = 1.0 / (float(self.head_dim) ** 0.5) + self.attention_qkv_bias = config.attention_qkv_bias self.wqs = nn.ModuleList( [ - nn.Linear(self.dim, self.head_dim, bias=False) + nn.Linear(self.dim, self.head_dim, bias=self.attention_qkv_bias) for _ in range(self.n_heads) ] ) self.wks = nn.ModuleList( [ - nn.Linear(self.dim, self.head_dim, bias=False) + nn.Linear(self.dim, self.head_dim, bias=self.attention_qkv_bias) for _ in range(self.n_kv_heads) ] ) self.wvs = nn.ModuleList( [ - nn.Linear(self.dim, self.head_dim, bias=False) + nn.Linear(self.dim, self.head_dim, bias=self.attention_qkv_bias) for _ in range(self.n_kv_heads) ] ) diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md index aede40ec4dc..9bf791a35ed 100644 --- a/examples/models/qwen2_5/README.md +++ b/examples/models/qwen2_5/README.md @@ -61,6 +61,3 @@ python -m examples.models.llama.runner.native --max_len 64 --temperature 0 ``` - - -