
Commit 7f1fa48

[Qwen3] Switch to verified RoPE implementation + Add weight tying support (#1590)
## Context

1. The current Qwen3 RoPE used a trick to achieve numerical parity with HF. The trick comes from an unofficial source and is hard to reason about mathematically. Switch to the [torchtune-based implementation](https://github.com/pytorch/torchtune/blob/main/torchtune/models/qwen2/_positional_embeddings.py#L14), which was contributed directly by the Qwen team. Thanks @ebsmothers for pointing us to this implementation!
   - For the RoPE embedding, it is now handled the same way as the complex-representation RoPE in Llama3: we initialize and precompute the RoPE cos/sin values only once, and pass them into the Attention module during forward. This way, TP can be applied seamlessly.
   - In contrast, torchtune passes a RoPE module into the initialization of each layer's attention module.
2. Add weight tying support for Qwen3, verified with FSDP + TP.

## Numerical verification for RoPE

Running an end-to-end forward pass of the Qwen3 model, the outputs match (see also the standalone sketch below):

<img width="812" height="412" alt="Screenshot 2025-08-18 at 2 48 48 PM" src="https://github.com/user-attachments/assets/618dde58-6546-4cdf-bd8c-2b828a5afa91" />

## Weight tying verification

1. With vs. without weight tying on the torchtitan model (FSDP=4; losses are exactly the same):

<img width="772" height="412" alt="Screenshot 2025-08-18 at 6 19 13 PM" src="https://github.com/user-attachments/assets/c0cfa049-c5c9-42a9-9133-b6ee32e9b9b4" />

2. torchtitan with weight tying vs. HF:

<img width="732" height="507" alt="Screenshot 2025-08-18 at 9 37 50 PM" src="https://github.com/user-attachments/assets/c4a51310-df02-4ade-8721-a17678445d52" />

3. Weight tying memory address / id check (in train.py): passed.

```
assert id(model.tok_embeddings.weight) == id(model.output.weight), "id check 2"
assertEqual(model.tok_embeddings.weight, model.output.weight)
# model.forward()
assert id(model.tok_embeddings.weight.grad) == id(model.output.weight.grad), "id check 2"
assertEqual(model.tok_embeddings.weight.grad, model.output.weight.grad)
```

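The parity verified above can also be checked in isolation: in the half-split layout, `x * cos + rotate_half(x) * sin` rotates each pair `(x_i, x_{i + dim//2})` by the angle `t * theta_i`, which is exactly a complex rotation. A minimal standalone sketch of that identity (the helper names below are illustrative, not code from this PR):

```python
# Standalone sketch: the cos/sin + rotate_half formulation equals a complex
# rotation applied to the half-split pairs (x_i, x_{i + dim//2}).
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def rope_cos_sin(dim: int, seq_len: int, base: float = 1_000_000.0):
    # Same construction as precompute_rope_cache in this PR, inlined for clarity
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    angles = torch.outer(torch.arange(seq_len).float(), freqs)  # [seq_len, dim // 2]
    cos = torch.cat([angles.cos(), angles.cos()], dim=-1)  # [seq_len, dim]
    sin = torch.cat([angles.sin(), angles.sin()], dim=-1)
    return cos, sin, angles


dim, seq_len = 128, 16
x = torch.randn(seq_len, dim)
cos, sin, angles = rope_cos_sin(dim, seq_len)

# New formulation used in this PR
out_new = x * cos + rotate_half(x) * sin

# Reference: rotate (x_i, x_{i + dim//2}) as a complex number by exp(i * t * theta_i)
x_complex = torch.complex(x[..., : dim // 2], x[..., dim // 2 :])
rotated = x_complex * torch.polar(torch.ones_like(angles), angles)
out_ref = torch.cat([rotated.real, rotated.imag], dim=-1)

torch.testing.assert_close(out_new, out_ref)
print("rotate_half formulation matches complex rotation")
```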
5 files changed: +83 additions, -96 deletions

torchtitan/experiments/qwen3/__init__.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -16,12 +16,12 @@
 
 from .infra.parallelize import parallelize_qwen3
 from .model.args import Qwen3ModelArgs
-from .model.model import Transformer
+from .model.model import Qwen3Model
 
 __all__ = [
     "parallelize_qwen3",
     "Qwen3ModelArgs",
-    "Transformer",
+    "Qwen3Model",
    "qwen3_configs",
 ]
 
@@ -107,7 +107,7 @@
 register_train_spec(
     TrainSpec(
         name="qwen3",
-        model_cls=Transformer,
+        model_cls=Qwen3Model,
         model_args=qwen3_configs,  # Change from dict to Mapping
         parallelize_fn=parallelize_qwen3,
         pipelining_fn=None,
```

torchtitan/experiments/qwen3/infra/parallelize.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -120,6 +120,10 @@ def parallelize_qwen3(
             enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
         )
 
+    # Enable weight tying after applying parallelisms
+    if model.model_args.enable_weight_tying:
+        model.output.weight = model.tok_embeddings.weight
+
     return model
 
 
```

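The assignment above makes `output` and `tok_embeddings` share a single `nn.Parameter`, which is what the id and gradient checks in the commit message rely on. A minimal single-process sketch of the same semantics on a toy module (not torchtitan code):

```python
# Toy sketch of weight-tying semantics: after the assignment, embedding and
# output head reference one nn.Parameter, so ids match and gradients
# accumulate into the same tensor.
import torch
import torch.nn as nn


class TinyTiedLM(nn.Module):
    def __init__(self, vocab_size: int = 32, dim: int = 16):
        super().__init__()
        self.tok_embeddings = nn.Embedding(vocab_size, dim)
        self.output = nn.Linear(dim, vocab_size, bias=False)
        # Same tying assignment as in parallelize_qwen3 above
        self.output.weight = self.tok_embeddings.weight

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        return self.output(self.tok_embeddings(tokens))


model = TinyTiedLM()
assert id(model.tok_embeddings.weight) == id(model.output.weight)

tokens = torch.randint(0, 32, (2, 8))
loss = model(tokens).sum()
loss.backward()

# Both views of the tied parameter see the same (single) gradient tensor
assert model.tok_embeddings.weight.grad is model.output.weight.grad
print("tied parameter and gradient are shared")
```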
torchtitan/experiments/qwen3/model/args.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -38,6 +38,8 @@ class Qwen3ModelArgs(BaseModelArgs):
     attn_mask_type: str = "causal"
     eos_id: int = 151645
 
+    enable_weight_tying: bool = False
+
     def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         seq_len = job_config.training.seq_len
         if seq_len > self.max_seq_len:
```

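With the new flag in place, a flavor that ties embeddings can opt in through its model args. A hypothetical entry is sketched below (illustrative values only; the actual `qwen3_configs` definitions are not part of this diff):

```python
# Hypothetical flavor entry; illustrative values only. enable_weight_tying
# mirrors HF's tie_word_embeddings setting for the small Qwen3 variants.
from torchtitan.experiments.qwen3.model.args import Qwen3ModelArgs

qwen3_configs = {
    "0.6B": Qwen3ModelArgs(
        dim=1024,
        n_layers=28,
        enable_weight_tying=True,
    ),
}
```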
torchtitan/experiments/qwen3/model/model.py

Lines changed: 71 additions & 89 deletions

```diff
@@ -16,99 +16,77 @@
 
 from .args import Qwen3ModelArgs
 
-
-def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
-    """
-    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
-
-    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
-    and the end index 'end'. The 'theta' parameter scales the frequencies.
-    The returned tensor contains complex values in complex64 data type.
-
-    Args:
-        dim (int): Dimension of the frequency tensor.
-        end (int): End index for precomputing frequencies.
-        theta (float | None): Scaling factor for frequency computation. Defaults to 10000.0.
-
-    Returns:
-        torch.Tensor: Precomputed frequency tensor with complex exponentials.
+# Adapted from https://github.com/pytorch/torchtune/blob/main/torchtune/models/qwen2/_positional_embeddings.py
+def precompute_rope_cache(
+    dim: int, max_seq_len: int, base: float = 1_000_000.0
+) -> torch.Tensor:
+    freqs = 1.0 / (base ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    # Create position indexes `[0, 1, ..., max_seq_len - 1]`
+    t = torch.arange(max_seq_len, dtype=freqs.dtype, device=freqs.device)
+
+    # Outer product of theta and position index; output tensor has
+    # a shape of [max_seq_len, dim // 2]
+    idx_theta = torch.outer(t, freqs).float()
+
+    # We cache the cos and sin embeddings instead of the IDs. This helps
+    # ensure we have correct behavior when training with bf16
+    # Size: [max_seq_len, (dim * 2)]
+    freqs = torch.cat([idx_theta, idx_theta], dim=-1)
+    rope_cache = torch.cat([freqs.cos(), freqs.sin()], dim=-1)
+    return rope_cache
+
+
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def reshape_for_broadcast(rope_cache: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
     """
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
-    t = torch.arange(end, device=freqs.device)
-    freqs = torch.outer(t, freqs).float()
-    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
-    return freqs_cis
-
-
-def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-    """
-    Reshape frequency tensor for broadcasting it with another tensor.
+    Reshape frequency tensor (represented by cos, sin) for broadcasting it with another tensor.
 
     This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
     for the purpose of broadcasting the frequency tensor during element-wise operations.
 
-    The input freqs_cis tensor is assumed to be of shape (max_seqlen, dim),
+    The input freqs_cis tensor is assumed to be of shape (max_seqlen, head_dim * 2),
     and the first seqlen elements will be sliced, but dim must match x.
 
     Args:
-        freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
+        rope_cache (torch.Tensor): RoPE tensor (cos and sin) to be reshaped.
         x (torch.Tensor): Target tensor for broadcasting compatibility.
 
     Returns:
        torch.Tensor: Reshaped frequency tensor.
    """
    ndim = x.ndim
    assert ndim > 1
-    seqlen = x.shape[1]
-    freqs_cis = freqs_cis[0:seqlen]
-    assert freqs_cis.shape == (seqlen, x.shape[-1])
-    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-    return freqs_cis.view(*shape)
+    _, seqlen, _, head_dim = x.shape
+    rope_cache = rope_cache[0:seqlen]
+    # The shape of rope_cache is (seqlen, head_dim * 2) because we concate cos and sin
+    assert rope_cache.shape == (seqlen, head_dim * 2)
+    shape = [-1, seqlen, 1, head_dim * 2]
+    return rope_cache.view(*shape)
 
 
 def apply_rotary_emb(
-    xq: torch.Tensor,
-    xk: torch.Tensor,
-    freqs_cis: torch.Tensor,
+    xq: torch.Tensor, xk: torch.Tensor, rope_cache: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    """
-    Apply rotary embeddings to input tensors using the given frequency tensor.
+    # input tensor x has shape [bsz, seq_len, num_heads, head_dim]
+    head_dim = xq.shape[-1]
 
-    This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
-    frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
-    is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
-    returned as real tensors.
+    # reshape for broadcast
+    rope_cache = reshape_for_broadcast(rope_cache, xq)
 
-    Args:
-        xq (torch.Tensor): Query tensor to apply rotary embeddings.
-        xk (torch.Tensor): Key tensor to apply rotary embeddings.
-        freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.
-
-    Returns:
-        tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-    Note:
-        This function adds .transpose(-2,-1) to match HF implementation. This method assumes that last
-        dimension is [real_0, real_1, ..., real_{N-1}, imag_0, imag_1, ..., imag_{N-1}] while Rope in Llama3
-        has [real_0, imag_0, real_1, imag_1, ..., real_{N-1}, imag_{N-1}]. This is the main difference
-        between Llama3 and Qwen3 Rope which is under investigation.
-    """
-    xk_complex = torch.view_as_complex(
-        xk.view(*xk.shape[:-1], 2, xk.shape[-1] // 2)
-        .transpose(-2, -1)
-        .contiguous()
-        .float()
-    )
-    xq_complex = torch.view_as_complex(
-        xq.view(*xq.shape[:-1], 2, xq.shape[-1] // 2)
-        .transpose(-2, -1)
-        .contiguous()
-        .float()
-    )
-    freqs_cis = reshape_for_broadcast(freqs_cis, xq_complex)
-
-    xq_out = torch.view_as_real(xq_complex * freqs_cis).flatten(3)
-    xk_out = torch.view_as_real(xk_complex * freqs_cis).flatten(3)
+    # [bsz, seq_len, 1, head_dim]
+    cos = rope_cache[..., :head_dim].to(dtype=xq.dtype, device=xq.device)
+    sin = rope_cache[..., head_dim:].to(dtype=xq.dtype, device=xq.device)
 
+    # xq: [bsz, seq_len, num_heads, head_dim]
+    # xk: [bsz, seq_len, num_kv_heads, head_dim]
+    xq_out = (xq * cos) + (rotate_half(xq) * sin)
+    xk_out = (xk * cos) + (rotate_half(xk) * sin)
     return xq_out.type_as(xq), xk_out.type_as(xk)
 
 
@@ -189,14 +167,13 @@ def init_weights(self, init_std: float):
     def forward(
         self,
         x: torch.Tensor,
-        freqs_cis: torch.Tensor,
+        rope_cache: torch.Tensor,
     ):
         """
         Forward pass of the attention module.
 
         Args:
             x (torch.Tensor): Input tensor.
-            freqs_cis (torch.Tensor): Precomputed frequency tensor.
 
         Returns:
             torch.Tensor: Output tensor after attention.
@@ -220,9 +197,10 @@ def forward(
         if self.k_norm:
             xk = self.k_norm(xk)
 
-        # repeat k/v heads if n_kv_heads < n_heads
-        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
+        # Apply rotary embedding
+        xq, xk = apply_rotary_emb(xq, xk, rope_cache)
 
+        # repeat k/v heads if n_kv_heads < n_heads
         keys = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
         values = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
 
@@ -318,7 +296,7 @@ def __init__(self, layer_id: int, model_args: Qwen3ModelArgs):
     def forward(
         self,
         x: torch.Tensor,
-        freqs_cis: torch.Tensor,
+        rope_cache: torch.Tensor,
     ):
         """
         Perform a forward pass through the TransformerBlock.
@@ -331,7 +309,7 @@ def forward(
             torch.Tensor: Output tensor after applying attention and feedforward layers.
 
         """
-        h = x + self.attention(self.attention_norm(x), freqs_cis)
+        h = x + self.attention(self.attention_norm(x), rope_cache)
         out = h + self.feed_forward(self.ffn_norm(h))
         return out
 
@@ -342,9 +320,9 @@ def init_weights(self):
         self.feed_forward.init_weights(self.weight_init_std)
 
 
-class Transformer(nn.Module, ModelProtocol):
+class Qwen3Model(nn.Module, ModelProtocol):
     """
-    Transformer Module
+    Qwen3Model Module
 
     Args:
         model_args (TransformerModelArgs): Model configuration arguments.
@@ -370,13 +348,18 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.head_dim = model_args.head_dim
 
         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
-        self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True)
+
+        self.register_buffer(
+            "rope_cache", self._precompute_rope_cache(), persistent=False
+        )
 
         self.layers = torch.nn.ModuleDict()
         for layer_id in range(model_args.n_layers):
             self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)
         self.norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
+
         self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
+
         self.init_weights()
 
     def init_weights(
@@ -394,9 +377,9 @@ def init_weights(
            ``init_weights``. We only call it in the constructor of this
            ``Transformer`` root module to avoid reinitializing tensors.
        """
-        buffer_device = buffer_device or self.freqs_cis.device
+        buffer_device = buffer_device or self.rope_cache.device
        with torch.device(buffer_device):
-            self.freqs_cis = self._precompute_freqs_cis()
+            self.rope_cache = self._precompute_rope_cache()
        if self.tok_embeddings is not None:
            nn.init.normal_(self.tok_embeddings.weight)
        for layer in self.layers.values():
@@ -406,6 +389,8 @@ def init_weights(
             self.norm.reset_parameters()
         final_out_std = self.model_args.dim**-0.5
         cutoff_factor = 3
+
+        # If weight tying is enabled, we don't need to initialize the output layer
         if self.output is not None:
             nn.init.trunc_normal_(
                 self.output.weight,
@@ -415,12 +400,9 @@ def init_weights(
                 b=cutoff_factor * final_out_std,
             )
 
-    def _precompute_freqs_cis(self) -> torch.Tensor:
-        return precompute_freqs_cis(
-            self.head_dim,
-            # Need to compute until at least the max token limit for generation
-            # TODO: explain in docs/composability.md why we removed the 2x
-            # relaxing in our CP enablement PR
+    def _precompute_rope_cache(self) -> torch.Tensor:
+        return precompute_rope_cache(
+            self.model_args.head_dim,
             self.model_args.max_seq_len,
             self.model_args.rope_theta,
         )
@@ -459,7 +441,7 @@ def forward(
         h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
 
         for layer in self.layers.values():
-            h = layer(h, self.freqs_cis)
+            h = layer(h, self.rope_cache)
 
         h = self.norm(h) if self.norm else h
         output = self.output(h) if self.output else h
```

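A quick way to exercise the new helpers, assuming torchtitan is installed at this commit so they are importable: one `rope_cache` of shape `[max_seq_len, head_dim * 2]` serves both queries and keys, even with different head counts (GQA).

```python
# Sketch exercising the new RoPE helpers with GQA-style shapes
# (assumes torchtitan at this commit is importable; otherwise copy the
# functions from the diff above).
import torch

from torchtitan.experiments.qwen3.model.model import (
    apply_rotary_emb,
    precompute_rope_cache,
)

bsz, seq_len, n_heads, n_kv_heads, head_dim = 2, 32, 8, 4, 128

# One cache for the whole model: [max_seq_len, head_dim * 2] (cos || sin)
rope_cache = precompute_rope_cache(head_dim, max_seq_len=64)
assert rope_cache.shape == (64, head_dim * 2)

xq = torch.randn(bsz, seq_len, n_heads, head_dim)
xk = torch.randn(bsz, seq_len, n_kv_heads, head_dim)

# The cache is sliced to seq_len and broadcast over batch and heads,
# so queries and keys with different head counts share the same cache.
xq_out, xk_out = apply_rotary_emb(xq, xk, rope_cache)
assert xq_out.shape == xq.shape and xk_out.shape == xk.shape
```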
torchtitan/experiments/qwen3/train_configs/qwen3_0.6b.toml

Lines changed: 3 additions & 4 deletions

```diff
@@ -8,14 +8,14 @@ save_traces_folder = "profile_trace"
 profile_freq = 100
 
 [metrics]
-log_freq = 10
+log_freq = 1
 enable_tensorboard = false
 save_tb_folder = "tb"
 
 [model]
 name = "qwen3"
 flavor = "0.6B"
-tokenizer_path = "./assets/tokenizer/Qwen3-0.6B"
+hf_assets_path = "./assets/hf/Qwen3-0.6B"
 # converters = ["float8"]
 
 [optimizer]
@@ -24,7 +24,7 @@ lr = 3e-4
 eps = 1e-8
 
 [lr_scheduler]
-warmup_steps = 1  # lr scheduler warm up
+warmup_steps = 2  # lr scheduler warm up, 20% total steps
 
 [training]
 local_batch_size = 4
@@ -34,7 +34,6 @@ steps = 10
 compile = false
 dataset = "c4"
 
-
 [parallelism]
 data_parallel_replicate_degree = 1
 data_parallel_shard_degree = -1
```
