
Commit b043824

Merge pull request #2561 from AI-Hypercomputer:hengtaoguo-vl
PiperOrigin-RevId: 827750023
2 parents: 84f3ad6 + ca5282b

6 files changed (+141, -1 lines changed)

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Model config for Qwen3-Omni-30B-A3B
+
+# Core Architectural Parameters
+decoder_block: "qwen3_moe"
+base_emb_dim: 2048
+base_mlp_dim: 768
+base_num_query_heads: 32
+base_num_kv_heads: 4
+base_num_decoder_layers: 48
+head_dim: 128
+mlp_activations: ["silu", "linear"]
+vocab_size: 152064
+normalization_layer_epsilon: 1.0e-6
+use_qk_norm: True
+
+# MoE Specific Parameters
+num_experts: 128
+num_experts_per_tok: 8
+base_moe_mlp_dim: 768
+norm_topk_prob: true
+
+# RoPE Settings
+rope_max_timescale: 10_000_000
+
+# General Model Settings
+enable_dropout: False
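
Editor's note: the "30B-A3B" suffix follows the Qwen convention of roughly 30B total parameters with about 3B activated per token, and the values in this config are consistent with that. A rough back-of-the-envelope check (my own sketch, not part of the PR; it ignores router, norm, and bias parameters and assumes untied input/output embeddings):

# Hedged sanity check of the "30B-A3B" naming, using the config values above.
emb, layers = 2048, 48                 # base_emb_dim, base_num_decoder_layers
q_heads, kv_heads, head_dim = 32, 4, 128
moe_mlp, experts, top_k = 768, 128, 8  # base_moe_mlp_dim, num_experts, num_experts_per_tok
vocab = 152064                         # vocab_size

attn = layers * head_dim * emb * (2 * q_heads + 2 * kv_heads)  # Q, K, V, O projections
per_expert = 3 * emb * moe_mlp                                 # gate, up, down projections
embed = vocab * emb

total = attn + layers * experts * per_expert + 2 * embed
active = attn + layers * top_k * per_expert + embed
print(f"total ~= {total / 1e9:.1f}B, active ~= {active / 1e9:.1f}B")  # ~= 30.5B, ~= 3.0B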

src/MaxText/pyconfig.py

Lines changed: 1 addition & 0 deletions
@@ -454,6 +454,7 @@ def validate_model_name(s: str) -> bool:
     "qwen3-30b-a3b",
     "qwen3-480b-a35b",
     "qwen3-next-80b-a3b",
+    "qwen3-omni-30b-a3b",
     "gpt3-175b",
     "gpt3-22b",
     "gpt3-6b",

src/MaxText/utils/ckpt_conversion/to_maxtext.py

Lines changed: 13 additions & 1 deletion
@@ -182,6 +182,18 @@ def _build_single_axis_stacked_tensor(
   return np.stack(tensors_to_stack, axis=axis_to_stack)


+def _get_hf_model(model_id: str, token: str):
+  """Loads the HuggingFace model based on model_id."""
+  # Some models require special classes to import
+  if model_id in ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]:
+    from transformers import Qwen3OmniMoeForConditionalGeneration  # pylint: disable=import-outside-toplevel
+
+    hf_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(model_id, token=token)
+  else:
+    hf_model = AutoModelForCausalLM.from_pretrained(model_id, token=token)
+  return hf_model
+
+
 def main(argv: Sequence[str]) -> None:
   jax.config.update("jax_default_prng_impl", "unsafe_rbg")
   os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"  # Suppress TensorFlow logging
@@ -217,7 +229,7 @@ def main(argv: Sequence[str]) -> None:
   # Load HuggingFace model, config, and state_dict
   max_logging.log(f"Loading HuggingFace model: {model_id}...")
   hf_config_obj = AutoConfig.from_pretrained(model_id, token=hf_token)
-  hf_model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
+  hf_model = _get_hf_model(model_id, token=hf_token)
   hf_state_dict_numpy = hf_model.state_dict()
   for k, v in hf_state_dict_numpy.items():
     hf_state_dict_numpy[k] = v.numpy()
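
Editor's note: the if/else in _get_hf_model is fine for a single special case; if more multi-modal models follow, a table-driven dispatch keeps the branch flat. A hedged sketch of that alternative (my suggestion, not what the PR ships; it assumes each special class is importable from the top-level transformers namespace):

from transformers import AutoModelForCausalLM

# Model IDs that AutoModelForCausalLM cannot load, mapped to the class name
# that can. The Qwen3-Omni entry mirrors the PR; any further entries would be
# hypothetical.
_SPECIAL_MODEL_CLASSES = {
    "Qwen/Qwen3-Omni-30B-A3B-Instruct": "Qwen3OmniMoeForConditionalGeneration",
}


def get_hf_model(model_id: str, token: str):
  """Loads an HF model, falling back to AutoModelForCausalLM when no special class is registered."""
  class_name = _SPECIAL_MODEL_CLASSES.get(model_id)
  if class_name is None:
    return AutoModelForCausalLM.from_pretrained(model_id, token=token)
  import transformers  # pylint: disable=import-outside-toplevel

  return getattr(transformers, class_name).from_pretrained(model_id, token=token)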

src/MaxText/utils/ckpt_conversion/utils/hf_model_configs.py

Lines changed: 12 additions & 0 deletions
@@ -469,6 +469,17 @@
     vocab_size=151936,
 )

+qwen3_omni_30b_a3b_config = transformers.Qwen3OmniMoeConfig(
+    # TODO(hengtaoguo): Pure-text Omni model, need to fill in visual/audio/code2wav parts
+    architectures=["Qwen3OmniMoeForConditionalGeneration"],
+    thinker_config={
+        "text_config": {
+            "num_hidden_layers": 48,
+            "num_experts": 128,
+        }
+    },
+)
+
 HF_MODEL_CONFIGS = {
     "gemma2-2b": gemma2_2b_config,
     "gemma2-9b": gemma2_9b_config,
@@ -489,4 +500,5 @@
     "qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
     "qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
     "qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
+    "qwen3-omni-30b-a3b": qwen3_omni_30b_a3b_config,
 }
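
Editor's note: unlike the flat text-only configs earlier in this file, the Omni config nests its text parameters under thinker_config.text_config, and the mapping functions in the next file index into exactly that path. A minimal sketch of the round-trip (assuming a transformers release that ships Qwen3OmniMoeConfig, per the PR):

import transformers

cfg = transformers.Qwen3OmniMoeConfig(
    architectures=["Qwen3OmniMoeForConditionalGeneration"],
    thinker_config={"text_config": {"num_hidden_layers": 48, "num_experts": 128}},
)
# to_dict() serializes nested sub-configs, yielding the dict shape that the
# QWEN3_OMNI_MOE_* functions below index into.
text_cfg = cfg.to_dict()["thinker_config"]["text_config"]
print(text_cfg["num_hidden_layers"], text_cfg.get("num_experts", 0))  # 48 128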

src/MaxText/utils/ckpt_conversion/utils/param_mapping.py

Lines changed: 74 additions & 0 deletions
@@ -814,6 +814,78 @@ def reshape_kernel(input_tensor, target_shape):
   return mapping


+def QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_MAPPING(config, scan_layers=False):
+  """Returns mapping from MaxText to HuggingFace Qwen3-Omni weight paths.
+
+  This function combines mappings from different modalities (text, vision,
+  audio, etc.) into a unified parameter mapping for the multi-modal
+  Qwen3-Omni model.
+
+  Args:
+    config (dict): Model configuration dictionary containing modality-specific configs.
+    scan_layers (bool, optional): Whether the model uses scanned layers. Defaults to False.
+
+  Returns:
+    dict: Combined mapping from all modalities.
+  """
+  # Collect all modality mappings
+  mapping = {}
+
+  # Text mapping with "thinker." prefix, reusing QWEN3-MOE mapping function
+  num_experts_text = config["thinker_config"]["text_config"].get("num_experts", 0)
+  n_layers_text = config["thinker_config"]["text_config"]["num_hidden_layers"]
+  text_mapping = QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING(
+      config={"num_hidden_layers": n_layers_text, "num_experts": num_experts_text}, scan_layers=scan_layers
+  )
+
+  # Add "thinker." prefix to text mapping values
+  for key, value in text_mapping.items():
+    text_mapping[key] = [f"thinker.{v}" for v in value] if isinstance(value, list) else f"thinker.{value}"
+  mapping.update(text_mapping)
+
+  # TODO(hengtaoguo): Add vision, audio, and other modality mappings here similarly
+  # mapping.update(vision_mapping), mapping.update(audio_mapping), etc.
+
+  return mapping
+
+
+def QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_HOOK_FN(config, scan_layers=False, saving_to_hf=False):
+  """Creates parameter transformation functions for Qwen3-Omni.
+
+  This function provides a dictionary of transformation functions (hooks) for
+  converting Qwen3-Omni model parameters between MaxText and Hugging Face
+  formats. It handles embedding padding and kernel reshaping.
+
+  Args:
+    config (dict): Model configuration dictionary containing modality-specific configs.
+    scan_layers (bool, optional): Whether the model uses scanned layers.
+      Defaults to False.
+    saving_to_hf (bool, optional): The direction of conversion. True for
+      MaxText to Hugging Face, False for the reverse. Defaults to False.
+
+  Returns:
+    dict: A dictionary mapping MaxText parameter names to their corresponding
+      transformation functions.
+  """
+  # Collect all modality hooks
+  mapping = {}
+
+  # Text hooks, reusing QWEN3-MOE hook function
+  num_experts_text = config["thinker_config"]["text_config"].get("num_experts", 0)
+  n_layers_text = config["thinker_config"]["text_config"]["num_hidden_layers"]
+  text_hooks = QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN(
+      config={"num_hidden_layers": n_layers_text, "num_experts": num_experts_text},
+      scan_layers=scan_layers,
+      saving_to_hf=saving_to_hf,
+  )
+  mapping.update(text_hooks)
+
+  # TODO(hengtaoguo): Add vision, audio, and other modality hooks here similarly
+  # mapping.update(vision_hooks), mapping.update(audio_hooks), etc.
+
+  return mapping
+
+
 def LLAMA31_MAXTEXT_TO_HF_PARAM_MAPPING(config, scan_layers=False):
   """
   Returns a dictionary mapping from MaxText parameter names to
@@ -1007,6 +1079,7 @@ def from_hf():
     "qwen3-30b-a3b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "qwen3-235b-a22b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "qwen3-coder-480b-a35b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-omni-30b-a3b": QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_MAPPING,
 }

 HOOK_FNS = {
@@ -1028,4 +1101,5 @@ def from_hf():
     "qwen3-30b-a3b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "qwen3-235b-a22b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "qwen3-coder-480b-a35b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-omni-30b-a3b": QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_HOOK_FN,
 }
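
Editor's note: the only transformation the Omni wrapper adds on top of the reused Qwen3-MoE text mapping is the "thinker." prefix, because the Omni checkpoint nests the text model under its thinker module. A tiny self-contained illustration of that prefixing step (the parameter names here are simplified stand-ins, not the exact MaxText keys):

text_mapping = {
    "params-token_embedder-embedding": "model.embed_tokens.weight",
    "params-decoder-layers_0-self_attention-query-kernel": [
        "model.layers.0.self_attn.q_proj.weight",
    ],
}
# Same transformation as in QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_MAPPING:
# list-valued entries (e.g. stacked scan layers) are prefixed element-wise.
for key, value in text_mapping.items():
  text_mapping[key] = [f"thinker.{v}" for v in value] if isinstance(value, list) else f"thinker.{value}"

print(text_mapping["params-token_embedder-embedding"])
# -> thinker.model.embed_tokens.weight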

src/MaxText/utils/ckpt_conversion/utils/utils.py

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@
   "qwen3-30b-a3b": "Qwen/Qwen3-30B-A3B-Thinking-2507",
   "qwen3-235b-a22b": "Qwen/Qwen3-235B-A22B-Thinking-2507",
   "qwen3-480b-a35b": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
+  "qwen3-omni-30b-a3b": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
 }
