
Commit ff5be4a

Merge pull request #2529 from AI-Hypercomputer:rbierneni-qwen3-next-fullattention
PiperOrigin-RevId: 827996001
2 parents 0337876 + ee4b38a commit ff5be4a

8 files changed (+703, -142 lines)

src/MaxText/configs/base.yml

Lines changed: 2 additions & 0 deletions
@@ -905,6 +905,8 @@ gdn_num_value_heads: 32
 gdn_chunk_size: 64
 # Whether to apply L2 normalization to query and key tensors inside the Gated Delta Rule kernel.
 use_qk_norm_in_gdn: True
+# The ratio of dimension to apply ROPE on
+partial_rotary_factor: 1.0
 
 # Use tokamax library for gmm kernel implementation
 use_tokamax_gmm: false

src/MaxText/configs/models/qwen3-next-80b-a3b.yml

Lines changed: 1 addition & 0 deletions
@@ -45,3 +45,4 @@ gdn_chunk_size: 64
 
 # RoPE Settings
 rope_max_timescale: 10000000
+partial_rotary_factor: 0.25
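For orientation, `partial_rotary_factor` controls what fraction of each attention head's dimensions receives RoPE; base.yml defaults it to 1.0 (full RoPE) and this model config overrides it to 0.25. A minimal sketch of the resulting split; the head_dim value is illustrative and not taken from this commit:

# Illustrative numbers only; head_dim is an assumption, not from this diff.
head_dim = 256
partial_rotary_factor = 0.25   # from qwen3-next-80b-a3b.yml above
rotary_dim = int(head_dim * partial_rotary_factor)
print(rotary_dim)              # 64 dims are rotated; the remaining 192 pass through unchanged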

src/MaxText/layers/attentions.py

Lines changed: 64 additions & 11 deletions
@@ -65,10 +65,11 @@
     LlamaVisionRotaryEmbedding,
     RotaryEmbedding,
     YarnRotaryEmbedding,
+    Qwen3NextRotaryEmbedding,
 )
 from MaxText.layers.initializers import nd_dense_init, NdInitializer, variable_to_logically_partitioned, default_bias_init
 from MaxText.layers.linears import DenseGeneral, canonicalize_tuple, normalize_axes
-from MaxText.layers.normalizations import RMSNorm
+from MaxText.layers.normalizations import RMSNorm, Qwen3NextRMSNorm
 from MaxText.layers.quantizations import AqtQuantization as Quant
 
 # pylint: disable=line-too-long, g-doc-args, g-doc-return-or-yield, bad-continuation, g-inconsistent-quotes
@@ -416,6 +417,8 @@ def __init__(
     self.model_mode = model_mode
     self.rngs = rngs
 
+    self.is_qwen3_next = self.config.decoder_block == DecoderBlockType.QWEN3_NEXT
+
     # Module attribute names must match names previously passed to Linen for checkpointing
     self.KVCache_0 = (
         self.init_kv_caches(inputs_kv_shape=inputs_kv_shape)
@@ -478,6 +481,9 @@ def __init__(
     else:
       self.sinks = None
 
+    self.query_norm = None
+    self.key_norm = None
+
     is_llama4_decoder_block = self.config.decoder_block == DecoderBlockType.LLAMA4
     if self.use_qk_norm and not is_llama4_decoder_block:
      self.query_norm = RMSNorm(
@@ -498,9 +504,21 @@ def __init__(
          kernel_axes=("norm",),
          rngs=self.rngs,
      )
-    else:
-      self.query_norm = None
-      self.key_norm = None
+    elif self.is_qwen3_next:
+      self.query_norm = Qwen3NextRMSNorm(
+          num_features=self.config.head_dim,
+          eps=self.config.normalization_layer_epsilon,
+          dtype=self.config.dtype,
+          weight_dtype=self.config.weight_dtype,
+          rngs=self.rngs,
+      )
+      self.key_norm = Qwen3NextRMSNorm(
+          num_features=self.config.head_dim,
+          eps=self.config.normalization_layer_epsilon,
+          dtype=self.config.dtype,
+          weight_dtype=self.config.weight_dtype,
+          rngs=self.rngs,
+      )
 
     self._maybe_shard_with_logical = functools.partial(
         maybe_shard_with_logical,
@@ -538,9 +556,15 @@ def query_init(*args):
     kernel_axes = (
         (None, None, None) if self.config.ici_context_autoregressive_parallelism > 1 else ("embed", "q_heads", "kv")
     )
+    in_features = self.convert_dense_general_inputs_shape(inputs_q_shape)
+    out_features = (self.num_query_heads, self.head_dim)
+
+    if self.is_qwen3_next:
+      out_features = (self.num_query_heads, self.head_dim * 2)
+
     return DenseGeneral(
-        in_features_shape=self.convert_dense_general_inputs_shape(inputs_q_shape),
-        out_features_shape=(self.num_query_heads, self.head_dim),
+        in_features_shape=in_features,
+        out_features_shape=out_features,
         axis=-1,
         kernel_init=query_init,
         kernel_axes=kernel_axes,
@@ -642,13 +666,22 @@ def qkv_projection(self, inputs: Array, proj_name: str, out_sharding: NamedShard
 
   def init_out_w(self, output_dim: int) -> nnx.Module:
     """out projection"""
+    in_features = (self.num_query_heads, self.head_dim)
+    out_features = output_dim
     out_kernel_axis = (
         (None, None, None) if self.config.ici_context_autoregressive_parallelism > 1 else ("heads", "kv", "embed")
     )
+    axis = (-2, -1)
+
+    if self.is_qwen3_next:
+      in_features = self.num_query_heads * self.head_dim
+      out_kernel_axis = ("mlp", "embed")
+      axis = (-1,)
+
     return DenseGeneral(
-        in_features_shape=(self.num_query_heads, self.head_dim),
-        out_features_shape=output_dim,
-        axis=(-2, -1),
+        in_features_shape=in_features,
+        out_features_shape=out_features,
+        axis=axis,
         kernel_init=self.kernel_init,
         kernel_axes=out_kernel_axis,  # trade speed with memory
         dtype=self.dtype,
@@ -720,6 +753,16 @@ def init_rotary_embedding(self):
          attention_scaling=self.config.rope_attention_scaling,
          rngs=self.rngs,
      )
+    elif self.is_qwen3_next:
+      rotary_embedding = Qwen3NextRotaryEmbedding(
+          min_timescale=self.config.rope_min_timescale,
+          max_timescale=self.config.rope_max_timescale,
+          embedding_dims=self.config.head_dim,
+          partial_rotary_factor=self.config.partial_rotary_factor,
+          cast_as_fprop_dtype=True,
+          fprop_dtype=self.config.dtype,
+          rngs=self.rngs,
+      )
     else:
       max_timescale = self.config.rope_max_timescale
       # For local attention use local_rope_max_timescale if it's is positive
@@ -890,9 +933,17 @@ def __call__(
      value_sharding = NamedSharding(self.mesh, nn.logical_to_mesh_axes(self.value_axis_names))
      value = self.kv_projection(inputs_kv, proj_name="value", out_sharding=value_sharding)
 
+    gate = None
+    if self.is_qwen3_next:
+      # Split query into query & gate.
+      query, gate = jnp.split(query, 2, axis=-1)
+      batch_size, seq_len, _, _ = gate.shape
+      gate = gate.reshape(batch_size, seq_len, self.config.num_query_heads * self.config.head_dim)
+
     is_llama4_decoder_block = self.config.decoder_block == DecoderBlockType.LLAMA4
     # NOTE: llama 4 does L2 normalization after RoPE
-    if self.use_qk_norm and not is_llama4_decoder_block:
+    # Apply Qwen3Next specific RMS Norm
+    if (self.use_qk_norm and not is_llama4_decoder_block) or self.is_qwen3_next:
      query = self.query_norm(query)
      key = self.key_norm(key)
 
@@ -964,7 +1015,9 @@ def __call__(
        bidirectional_mask,
        self.sinks,
    )
-
+    if self.is_qwen3_next:
+      out = out.reshape(batch_size, seq_len, self.config.num_query_heads * self.config.head_dim)
+      out = out * jax.nn.sigmoid(gate)
    if model_mode == MODEL_MODE_PREFILL:
      out = self._maybe_shard_with_logical(out, self.prefill_out_axis_names)
    elif model_mode == MODEL_MODE_TRAIN and self.config.expert_shard_attention_option == EP_AS_CONTEXT:
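
Taken together, these attention changes give Qwen3-Next a gated full-attention path: the query projection emits 2x head_dim per head, the second half is flattened into a per-feature gate, and the attention output is multiplied by sigmoid(gate) before the (now flattened) output projection. A self-contained sketch of that gating path, with a placeholder attn_fn standing in for MaxText's attention op (names and shapes here are illustrative, not the MaxText API):

import jax
import jax.numpy as jnp

def gated_attention_output(query_and_gate, attn_fn, batch, seq, num_heads, head_dim):
  # query_and_gate: [B, S, H, 2*D] -- the doubled query projection.
  query, gate = jnp.split(query_and_gate, 2, axis=-1)      # each [B, S, H, D]
  gate = gate.reshape(batch, seq, num_heads * head_dim)    # flatten heads for the gate
  out = attn_fn(query)                                     # attention output, [B, S, H, D]
  out = out.reshape(batch, seq, num_heads * head_dim)      # flatten heads to match the gate
  return out * jax.nn.sigmoid(gate)                        # elementwise output gating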

src/MaxText/layers/embeddings.py

Lines changed: 91 additions & 0 deletions
@@ -380,6 +380,97 @@ def llama_rotary_embedding_as_linen(
   )
 
 
+def qwen3_next_rotary_embedding_as_linen(
+    *,
+    min_timescale: int,
+    max_timescale: int,
+    embedding_dims: int = 0,
+    partial_rotary_factor: float = 0.25,
+    cast_as_fprop_dtype: bool = True,
+    fprop_dtype: DType = jnp.bfloat16,
+    name: str | None = None,
+):
+  """Initializes the Qwen3NextRotaryEmbedding module and returns it as a Linen module.
+
+  Args:
+    min_timescale: Start of the geometric index. Determines the periodicity of
+      the added signal.
+    max_timescale: End of the geometric index. Determines the frequency of the
+      added signal.
+    embedding_dims: Dimension of the embedding to be generated.
+    partial_rotary_factor: Ratio of dimensions to apply ROPE to.
+    cast_as_fprop_dtype: Whether to cast the output to the fprop dtype.
+    fprop_dtype: The dtype of the output.
+    name: Name of the Linen module.
+  """
+  return nnx_wrappers.to_linen(
+      Qwen3NextRotaryEmbedding,
+      min_timescale=min_timescale,
+      max_timescale=max_timescale,
+      embedding_dims=embedding_dims,
+      partial_rotary_factor=partial_rotary_factor,
+      cast_as_fprop_dtype=cast_as_fprop_dtype,
+      fprop_dtype=fprop_dtype,
+      metadata_fn=variable_to_logically_partitioned,
+      name=name,
+  )
+
+
+class Qwen3NextRotaryEmbedding(RotaryEmbedding):
+  """Qwen3 Next variant of ROPE (partial ROPE)."""
+
+  def __init__(
+      self,
+      min_timescale: int,
+      max_timescale: int,
+      embedding_dims: int = 0,
+      cast_as_fprop_dtype: bool = True,
+      fprop_dtype: DType = jnp.bfloat16,
+      partial_rotary_factor: float = 0.25,
+      rngs: nnx.Rngs = None,
+  ):
+    """Initializes the Qwen3NextRotaryEmbedding module.
+
+    Args:
+      min_timescale: Start of the geometric index. Determines the periodicity of
+        the added signal.
+      max_timescale: End of the geometric index. Determines the frequency of the
+        added signal.
+      embedding_dims: Dimension of the embedding to be generated.
+      partial_rotary_factor: Ratio of dimensions to apply ROPE to.
+      rngs: rng keys passed in by nnx.bridge.to_linen.
+    """
+    self.head_dim = embedding_dims
+    self.partial_rotary_factor = partial_rotary_factor
+    self.rotary_dim = int(self.head_dim * self.partial_rotary_factor)
+
+    super().__init__(
+        min_timescale=min_timescale,
+        max_timescale=max_timescale,
+        embedding_dims=self.rotary_dim,
+        cast_as_fprop_dtype=cast_as_fprop_dtype,
+        fprop_dtype=fprop_dtype,
+        rngs=rngs,
+    )
+
+  def __call__(self, inputs: jax.Array, position: None | jax.Array = None) -> jax.Array:
+    """Applies the Qwen3-Next (partial) variant of rotary position embedding.
+
+    Args:
+      inputs: The input sequence on which to apply the rotary position
+        embedding. It is assumed of shape [B, S, H, D].
+      position: Optional position array [B, S]. Only needed when the sequence
+        is packed.
+
+    Returns:
+      A jax.Array of shape [B, S, H, D] with rotary position embeddings applied to the first rotary_dim features.
+    """
+    inputs_rot, inputs_pass = jnp.split(inputs, [self.rotary_dim], axis=-1)
+    inputs_rot = super().__call__(inputs_rot, position)
+    inputs = jnp.concatenate([inputs_rot, inputs_pass], axis=-1)
+    return inputs
+
+
 class LLaMARotaryEmbedding(RotaryEmbedding):
   """LLaMA variant of ROPE."""
 
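
The new class keeps the rotary math in the parent RotaryEmbedding and only restricts it to the first rotary_dim features of each head. A standalone sketch of that split-rotate-concatenate pattern, with a stand-in apply_rope function in place of the parent class:

import jax.numpy as jnp

def partial_rope(x, rotary_dim, apply_rope):
  # x: [B, S, H, D]; only the first rotary_dim features receive RoPE.
  x_rot, x_pass = jnp.split(x, [rotary_dim], axis=-1)
  x_rot = apply_rope(x_rot)                         # standard RoPE on the rotated slice
  return jnp.concatenate([x_rot, x_pass], axis=-1)  # output shape is unchanged: [B, S, H, D]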

src/MaxText/layers/normalizations.py

Lines changed: 80 additions & 2 deletions
@@ -18,6 +18,7 @@
 
 from flax import linen as nn
 from flax import nnx
+from flax.linen import initializers as linen_initializers
 from jax import lax
 import jax
 import jax.numpy as jnp
@@ -26,7 +27,7 @@
 from MaxText import max_utils
 from MaxText.layers import nnx_wrappers
 from MaxText.layers.initializers import Initializer, variable_to_logically_partitioned
-from MaxText.common_types import Array, ShardMode
+from MaxText.common_types import Array, DType, ShardMode
 
 
 class RMSNorm(nnx.Module):
@@ -42,6 +43,7 @@ def __init__(
       kernel_axes: tuple[None | str, ...] = (),
       scale_init: Initializer = nn.initializers.ones,
       parameter_memory_host_offload: bool = False,
+      scale_offset: float = 0.0,
       *,
       rngs: nnx.Rngs,
   ):
@@ -53,6 +55,7 @@ def __init__(
     self.kernel_axes = kernel_axes
     self.scale_init = scale_init
     self.parameter_memory_host_offload = parameter_memory_host_offload
+    self.scale_offset = scale_offset
     self.scale = nnx.Param(
         scale_init(rngs.params(), (num_features,), weight_dtype),
         sharding=kernel_axes,
@@ -73,8 +76,83 @@ def __call__(self, x: jnp.ndarray, out_sharding: NamedSharding | None = None) ->
       out_sharding = None
 
     scale = jnp.asarray(scale, self.dtype)
+    effective_scale = scale + self.scale_offset  # Apply offset
     # broadcast 2nd input then element-wise mul
-    return jnp.einsum("i...k,...k->i...k", y, scale, out_sharding=out_sharding)
+    return jnp.einsum("i...k,...k->i...k", y, effective_scale, out_sharding=out_sharding)
+
+
+def Qwen3NextRMSNorm(num_features: int, eps: float, dtype: DType, weight_dtype: DType, *, rngs: nnx.Rngs):
+  """
+  Used for input and post attention layernorms
+  in Qwen3NextDecoderLayer.
+
+  This normalization layer is specific to Qwen3-Next. Key characteristics:
+  1. The learnable scale parameter `scale` is initialized to ZEROS.
+  2. The scale is applied as `(1.0 + self.scale)`, making the initial scale effectively 1.0.
+  This matches the PyTorch implementation of Qwen3NextRMSNorm.
+  """
+  return nnx.data(
+      RMSNorm(
+          num_features=num_features,
+          epsilon=eps,
+          dtype=dtype,
+          weight_dtype=weight_dtype,
+          scale_init=linen_initializers.zeros,
+          scale_offset=1.0,
+          rngs=rngs,
+      )
+  )
+
+
+class Qwen3NextRMSNormGated(nnx.Module):
+  """
+  This applies RMS Normalization and then a gated activation function (SiLU).
+  This is used within the Qwen3NextGatedDeltaNet.
+
+  The normalization is performed by an internal `RMSNorm` instance (`self.rms_norm`),
+  which has its own learnable `scale` parameter, initialized to ONES.
+
+  Attributes:
+    num_features: The number of features in the input.
+    eps: A small epsilon value to prevent division by zero in RMSNorm.
+    dtype: The datatype of the computation.
+    weight_dtype: The datatype of the internal RMSNorm scale.
+  """
+
+  def __init__(self, num_features: int, eps: float, dtype: DType, weight_dtype: DType, *, rngs: nnx.Rngs):
+    self.num_features = num_features
+    self.eps = eps
+    self.dtype = dtype
+    self.weight_dtype = weight_dtype
+    self.rms_norm = nnx.data(
+        RMSNorm(
+            num_features=num_features,
+            epsilon=eps,
+            dtype=dtype,
+            weight_dtype=weight_dtype,
+            scale_init=nnx.initializers.ones,
+            rngs=rngs,
+        )
+    )
+
+  def __call__(self, hidden_states: Array, gate: Array) -> Array:
+    """
+    Applies RMSNorm and then a SiLU gate.
+
+    Args:
+      hidden_states: The input array to be normalized (o). Shape: (..., F)
+      gate: The gating array for the activation (z). Shape: (..., F),
+        where F is num_features.
+
+    Returns:
+      The normalized and gated output array. Shape: (..., F)
+    """
+    normalized_states = self.rms_norm(hidden_states)
+
+    # Gated Activation using SiLU (Sigmoid-weighted Linear Unit)
+    gated_states = normalized_states * jax.nn.silu(gate.astype(jnp.float32))
+
+    return gated_states.astype(self.dtype)
 
 
 def rms_norm(
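
The two Qwen3-Next norms above differ only in how the learned scale enters: Qwen3NextRMSNorm zero-initializes `scale` and applies it as `scale + 1.0` (via the new `scale_offset`), while Qwen3NextRMSNormGated keeps a ones-initialized scale and gates the normalized activations with SiLU. A simplified sketch of both formulas (not the MaxText RMSNorm internals, which also handle sharding and host offload):

import jax
import jax.numpy as jnp

def simple_rms_norm(x, scale, eps=1e-6, scale_offset=0.0):
  # RMS-normalize the last axis, then apply the (optionally offset) learned scale.
  rms = jnp.sqrt(jnp.mean(jnp.square(x), axis=-1, keepdims=True) + eps)
  return (x / rms) * (scale + scale_offset)

x = jnp.ones((2, 8))

# Qwen3NextRMSNorm: zeros-initialized scale plus offset 1.0 -> effective scale starts at 1.0.
y = simple_rms_norm(x, scale=jnp.zeros(8), scale_offset=1.0)

# Qwen3NextRMSNormGated: ones-initialized scale, followed by a SiLU gate.
gate = jnp.zeros((2, 8))
y_gated = simple_rms_norm(x, scale=jnp.ones(8)) * jax.nn.silu(gate)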
