Commit d9a3ac1

Eitan Porath authored and hengtaoguo committed
Add Qwen3 Omni Vision Encoder
cast pyconfig types gemma3 vision is nope
1 parent c0abc4c commit d9a3ac1

File tree

14 files changed: +2022 −24 lines changed


src/MaxText/configs/base.yml

Lines changed: 7 additions & 0 deletions
@@ -897,6 +897,13 @@ vision_output_dim_for_vit: 4096
 pixel_shuffle_ratio_for_vit: 0.5
 projector_dropout_for_vit: 0.0

+# Qwen3-OmniMoe vision encoder
+spatial_merge_size_for_vit: 2
+out_hidden_size_for_vit: 512
+temporal_patch_size_for_vit: 2
+num_position_embeddings_for_vit: 1024
+deepstack_visual_indexes_for_vit: []
+
 # Subslice shape in the form of "x,y,z" when using pathways (single controller).
 # Example: "8,8" to use a 8x8 subgrid (64 chips) of a full pod (16x16) of trillium.
 subslice_shape: ""
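These base.yml entries only set repository-wide defaults; the model config below overrides them. As a rough illustration of what the two new size knobs mean, here is a small hypothetical sketch (names and arithmetic are mine, not from the commit) of how the merge and patch sizes translate into the number of vision tokens handed to the language model:

# Hypothetical helper, not part of this commit: token count after the ViT
# merges spatial patches. spatial_merge_size groups patches into s x s blocks;
# temporal_patch_size groups video frames before patch embedding.
def vision_token_count(num_frames, grid_h, grid_w, spatial_merge_size=2, temporal_patch_size=2):
  frame_groups = max(num_frames // temporal_patch_size, 1)
  merged_h = grid_h // spatial_merge_size
  merged_w = grid_w // spatial_merge_size
  return frame_groups * merged_h * merged_w

# A single 768x768 image with patch_size 16 gives a 48x48 patch grid,
# which merges down to 24x24 = 576 tokens.
print(vision_token_count(num_frames=1, grid_h=48, grid_w=48))  # 576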

src/MaxText/configs/models/qwen3-omni-30b-a3b.yml

Lines changed: 19 additions & 1 deletion
@@ -34,7 +34,25 @@ base_moe_mlp_dim: 768
 norm_topk_prob: true

 # RoPE Settings
-rope_max_timescale: 10_000_000
+rope_max_timescale: 1_000_000
+max_position_embeddings: 65536

 # General Model Settings
 enable_dropout: False
+
+# Vision Encoder Configuration
+# Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+image_size_for_vit: 768
+hidden_size_for_vit: 1152
+intermediate_size_for_vit: 4304
+num_attention_heads_for_vit: 16
+num_hidden_layers_for_vit: 27
+num_channels_for_vit: 3
+patch_size_for_vit: 16
+temporal_patch_size_for_vit: 2
+spatial_merge_size_for_vit: 2
+out_hidden_size_for_vit: 2048
+num_position_embeddings_for_vit: 2304
+deepstack_visual_indexes_for_vit: [8, 16, 24]
+
+use_multimodal: true
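A quick consistency check on the numbers above, as a hedged sketch (the relationships are assumptions drawn from the referenced Hugging Face config, not stated in this diff):

image_size, patch_size = 768, 16
hidden_size, spatial_merge_size = 1152, 2

patches_per_side = image_size // patch_size           # 48
num_patches = patches_per_side ** 2                   # 2304, matches num_position_embeddings_for_vit
merged_dim = hidden_size * spatial_merge_size ** 2    # 4608, assumed to be projected down to out_hidden_size_for_vit = 2048
print(patches_per_side, num_patches, merged_dim)      # 48 2304 4608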

src/MaxText/configs/types.py

Lines changed: 5 additions & 0 deletions
@@ -1194,6 +1194,11 @@ class VisionTower(BaseModel):
   num_hidden_layers_for_vit: int = Field(34, description="Number of hidden layers in the Vision Transformer.")
   rope_theta_for_vit: int = Field(10000, description="RoPE theta value for the Vision Transformer.")
   vision_output_dim_for_vit: int = Field(4096, description="Final output dimension of the vision-to-language projection.")
+  spatial_merge_size_for_vit: int = Field(2, description="Spatial merge factor for vision patches.")
+  out_hidden_size_for_vit: int = Field(512, description="Output dimension of ViT.")
+  temporal_patch_size_for_vit: int = Field(2, description="Temporal patch size for video inputs.")
+  num_position_embeddings_for_vit: int = Field(1024, description="Number of position embeddings for ViT.")
+  deepstack_visual_indexes_for_vit: list[int] = Field([], description="Layer indices to extract deep visual features.")


 class VisionProjector(BaseModel):
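These Pydantic fields are what give YAML and CLI overrides a concrete type (the "cast pyconfig types" note in the commit message). A minimal, self-contained sketch of that coercion, using a stand-in class rather than the real VisionTower:

from pydantic import BaseModel, Field

class VisionTowerSketch(BaseModel):
  # Stand-in for a subset of the fields added above.
  spatial_merge_size_for_vit: int = Field(2)
  deepstack_visual_indexes_for_vit: list[int] = Field([])

# String overrides (as they arrive from the command line) are coerced to the
# declared types during validation.
cfg = VisionTowerSketch(spatial_merge_size_for_vit="4", deepstack_visual_indexes_for_vit=["8", "16", "24"])
print(cfg.spatial_merge_size_for_vit)        # 4 (int)
print(cfg.deepstack_visual_indexes_for_vit)  # [8, 16, 24] (list[int])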

src/MaxText/layers/attention_mla.py

Lines changed: 1 addition & 0 deletions
@@ -671,6 +671,7 @@ def __call__(
       slot: Optional[int] = None,
       page_state: Optional[page_manager.PageState] = None,
       bidirectional_mask: Optional[Any] = None,
+      rope_kwargs: dict | None = None,
   ) -> Array:
     """Forward pass for MLA, reusing `AttentionOp` for the actual attention.

src/MaxText/layers/attentions.py

Lines changed: 41 additions & 15 deletions
@@ -16,7 +16,7 @@

 import dataclasses
 import functools
-from typing import Any, Iterable, Optional, Tuple, Union
+from typing import Any, Iterable, Optional, Tuple, Union, cast

 from jax.ad_checkpoint import checkpoint_name
 from jax.sharding import Mesh, NamedSharding

@@ -63,6 +63,7 @@
 from MaxText.layers.embeddings import (
     LLaMARotaryEmbedding,
     LlamaVisionRotaryEmbedding,
+    Qwen3OmniMoeVisionRotaryEmbedding,
     RotaryEmbedding,
     YarnRotaryEmbedding,
     Qwen3NextRotaryEmbedding,

@@ -720,15 +721,29 @@ def init_rotary_embedding(self):
     rope_type = self.config.rope_type.lower()
     rope_use_scale = self.config.rope_use_scale
     if self.is_vision:
-      rotary_embedding = LlamaVisionRotaryEmbedding(
-          image_size=self.config.image_size_for_vit,
-          patch_size=self.config.patch_size_for_vit,
-          hidden_size=self.config.hidden_size_for_vit,
-          num_attention_heads=self.config.num_attention_heads_for_vit,
-          rope_theta=self.config.rope_theta_for_vit,
-          fprop_dtype=self.dtype,
-          rngs=self.rngs,
-      )
+      if self.config.model_name.startswith("qwen3-omni"):
+        rotary_embedding = Qwen3OmniMoeVisionRotaryEmbedding(
+            hidden_size=self.config.hidden_size_for_vit,
+            num_attention_heads=self.config.num_attention_heads_for_vit,
+            spatial_merge_size=self.config.spatial_merge_size_for_vit,
+            rope_theta=self.config.rope_theta_for_vit,
+            fprop_dtype=self.dtype,
+            rngs=self.rngs,
+        )
+      elif self.config.model_name.startswith("llama4"):
+        rotary_embedding = LlamaVisionRotaryEmbedding(
+            image_size=self.config.image_size_for_vit,
+            patch_size=self.config.patch_size_for_vit,
+            hidden_size=self.config.hidden_size_for_vit,
+            num_attention_heads=self.config.num_attention_heads_for_vit,
+            rope_theta=self.config.rope_theta_for_vit,
+            cast_as_fprop_dtype=True,
+            fprop_dtype=self.dtype,
+            rngs=self.rngs,
+        )
+      else:
+        raise ValueError(f"Unsupported model type for vision rotary embedding: {self.config.model_name}")
+
     elif self.config.model_name.startswith("llama3.1") or rope_type.startswith("llama3.1"):
       rotary_embedding = LLaMARotaryEmbedding(
           min_timescale=self.config.rope_min_timescale,

@@ -784,18 +799,28 @@ def init_rotary_embedding(self):
       )
     return rotary_embedding

-  def apply_rotary_embedding(self, inputs: Array, inputs_positions: Optional[Array | None] = None):
+  def apply_rotary_embedding(
+      self, inputs: Array, inputs_positions: Optional[Array | None] = None, rope_kwargs: dict | None = None
+  ):
     """Applies rotary embeddings, handling different model types.

     Args:
       inputs: The input tensor to apply rotary embeddings to.
       inputs_positions: The positions of the inputs.
-      name: A name for the embedding layer.
+      rope_kwargs: A dictionary of keyword arguments for the rotary embedding.

     Returns:
       The input tensor with rotary embeddings applied.
     """
-    return self.rotary_embedding(inputs, inputs_positions)
+    if isinstance(self.rotary_embedding, Qwen3OmniMoeVisionRotaryEmbedding):
+      # For Qwen3OmniMoe vision, pass static dimensions from kwargs.
+      num_frames = rope_kwargs.get("num_frames")
+      height = rope_kwargs.get("height")
+      width = rope_kwargs.get("width")
+      # Type cast required: Omni rotary embedding uses different __call__ parameters than other embeddings.
+      return cast(Qwen3OmniMoeVisionRotaryEmbedding, self.rotary_embedding)(inputs, num_frames, height, width)
+    else:
+      return self.rotary_embedding(inputs, inputs_positions)

   def init_kv_caches(self, inputs_kv_shape: Tuple):
     """Initializes KVCache.

@@ -878,6 +903,7 @@ def __call__(
       slot: Optional[int] = None,
       page_state: Optional[page_manager.PageState] = None,
       bidirectional_mask: Any = None,
+      rope_kwargs: dict | None = None,
   ):
     """Applies Attention on the input data.

@@ -952,8 +978,8 @@ def __call__(
     use_qk_norm = self.use_qk_norm and use_rope

     if use_rope:
-      query = self.apply_rotary_embedding(query, inputs_positions=inputs_positions)
-      key = self.apply_rotary_embedding(key, inputs_positions=inputs_positions)
+      query = self.apply_rotary_embedding(query, inputs_positions=inputs_positions, rope_kwargs=rope_kwargs)
+      key = self.apply_rotary_embedding(key, inputs_positions=inputs_positions, rope_kwargs=rope_kwargs)

     if use_qk_norm and is_llama4_decoder_block:
       l2_norm = L2Norm(eps=self.config.normalization_layer_epsilon)
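To make the control flow above easier to follow, here is a self-contained toy sketch of the same dispatch pattern: the rotary embedding is chosen from the model-name prefix, and rope_kwargs carries the static num_frames/height/width only when the Omni vision embedding is in use. The classes here are illustrative stand-ins, not the MaxText implementations:

from typing import Optional

class OmniVisionRopeSketch:
  def __call__(self, x, num_frames, height, width):
    # The real embedding rotates by temporal/2D spatial position; elided here.
    return x

class TextRopeSketch:
  def __call__(self, x, positions):
    return x

def make_rotary(model_name: str, is_vision: bool):
  if is_vision and model_name.startswith("qwen3-omni"):
    return OmniVisionRopeSketch()
  return TextRopeSketch()

def apply_rotary(rope, x, positions=None, rope_kwargs: Optional[dict] = None):
  if isinstance(rope, OmniVisionRopeSketch):
    kw = rope_kwargs or {}
    return rope(x, kw.get("num_frames"), kw.get("height"), kw.get("width"))
  return rope(x, positions)

rope = make_rotary("qwen3-omni-30b-a3b", is_vision=True)
_ = apply_rotary(rope, x=[[0.0]], rope_kwargs={"num_frames": 1, "height": 48, "width": 48})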

src/MaxText/layers/decoders.py

Lines changed: 8 additions & 1 deletion
@@ -558,7 +558,14 @@ def _apply_embedding(

     # Merge the image embeddings with the text embeddings for multimodal models
     if image_embeddings is not None and cfg.use_multimodal:
-      if cfg.model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b", "llama4-17b-16e", "llama4-17b-128e"]:
+      if cfg.model_name in [
+          "gemma3-4b",
+          "gemma3-12b",
+          "gemma3-27b",
+          "llama4-17b-16e",
+          "llama4-17b-128e",
+          "qwen3-omni-30b-a3b",
+      ]:
         y = multimodal_utils.merge_mm_embeddings(
             text_embeddings=y,
             vision_embeddings=image_embeddings,
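qwen3-omni-30b-a3b joins the allow-list above, so its image embeddings are merged into the token stream the same way as for the Gemma3 and Llama4 models. As an illustration of what such a merge typically does, here is a hedged JAX sketch that scatters vision embeddings into the image-placeholder positions; the real multimodal_utils.merge_mm_embeddings may differ:

import jax.numpy as jnp

def merge_mm_embeddings_sketch(text_embeddings, vision_embeddings, image_token_mask):
  """text_embeddings: [batch, seq, dim]; vision_embeddings: [num_image_tokens, dim];
  image_token_mask: [batch, seq] bool, True where an image-placeholder token sits."""
  b, s, d = text_embeddings.shape
  flat_text = text_embeddings.reshape(-1, d)
  flat_mask = image_token_mask.reshape(-1)
  # Positions of the placeholder tokens, one per vision embedding.
  positions = jnp.nonzero(flat_mask, size=vision_embeddings.shape[0])[0]
  merged = flat_text.at[positions].set(vision_embeddings)
  return merged.reshape(b, s, d)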
