Skip to content

Commit d523b4d

Browse files
zhang-cheng09 and chanchzhang
authored and committed
fix hf_path
Signed-off-by: zhangcheng <chzhang_bj@163.com>
1 parent 8cc497f commit d523b4d

File tree

6 files changed

+45
-33
lines changed

6 files changed

+45
-33
lines changed

src/megatron/bridge/models/qwen_omni/context_parallel_utils.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,19 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
import torch
16+
from typing import Optional
217

318
from megatron.core import parallel_state as mpu
419

@@ -191,7 +206,7 @@ def backward(ctx, grad_output):
191206

192207

193208
def split_data_cp_rank(
194-
val: torch.Tensor, cp_size: int, seq_dim: int, cp_rank: int = None
209+
val: Optional[torch.Tensor], cp_size: int, seq_dim: int, cp_rank: int = None
195210
):
196211
assert cp_size > 1
197212
assert 0 == val.shape[seq_dim] % (2 * cp_size), f"{val.shape=} {cp_size=}"

src/megatron/bridge/models/qwen_omni/modelling_qwen3_omni.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -118,25 +118,25 @@ def forward(
118118
**kwargs,
119119
) -> torch.Tensor:
120120
return self.thinker(
121-
input_ids,
122-
input_features,
123-
position_ids,
124-
attention_mask,
125-
labels,
126-
loss_mask,
127-
inference_params,
128-
packed_seq_params,
129-
extra_block_kwargs,
130-
pixel_values,
131-
pixel_values_videos,
132-
image_grid_thw,
133-
video_grid_thw,
134-
image_input_mask,
135-
video_input_mask,
136-
feature_attention_mask,
137-
audio_feature_lengths,
138-
cp_img_num,
139-
use_audio_in_video,
140-
video_second_per_grid,
121+
input_ids=input_ids,
122+
input_features=input_features,
123+
position_ids=position_ids,
124+
attention_mask=attention_mask,
125+
labels=labels,
126+
loss_mask=loss_mask,
127+
inference_params=inference_params,
128+
packed_seq_params=packed_seq_params,
129+
extra_block_kwargs=extra_block_kwargs,
130+
pixel_values=pixel_values,
131+
pixel_values_videos=pixel_values_videos,
132+
image_grid_thw=image_grid_thw,
133+
video_grid_thw=video_grid_thw,
134+
image_input_mask=image_input_mask,
135+
video_input_mask=video_input_mask,
136+
feature_attention_mask=feature_attention_mask,
137+
audio_feature_lengths=audio_feature_lengths,
138+
cp_img_num=cp_img_num,
139+
use_audio_in_video=use_audio_in_video,
140+
video_second_per_grid=video_second_per_grid,
141141
**kwargs,
142142
)

src/megatron/bridge/models/qwen_omni/qwen3_omni_bridge.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,12 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import logging
16-
from typing import Dict, Mapping, Union
17-
1815
import torch
19-
import torch.nn as nn
20-
from megatron.core import parallel_state
2116
from transformers import Qwen3OmniMoeForConditionalGeneration
2217

2318
from megatron.bridge.models.conversion.mapping_registry import MegatronMappingRegistry
2419
from megatron.bridge.models.qwen_omni.modelling_qwen3_omni import Qwen3OmniMoeModel
25-
from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge, WeightConversionTask
20+
from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge
2621
from megatron.bridge.models.conversion.param_mapping import (
2722
AutoMapping,
2823
ConcatenatedQKVMapping,
@@ -32,7 +27,6 @@
3227
)
3328
from megatron.bridge.models.hf_pretrained.vlm import PreTrainedVLM
3429
from megatron.bridge.models.qwen_omni.qwen3_omni_provider import Qwen3OmniMoeModelProvider
35-
from megatron.bridge.utils.common_utils import extract_expert_number_from_param
3630

3731

3832
@MegatronModelBridge.register_bridge(source=Qwen3OmniMoeForConditionalGeneration, target=Qwen3OmniMoeModel)

src/megatron/bridge/models/qwen_omni/qwen3_omni_provider.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class Qwen3OmniMoeModelProvider(Qwen3MoEModelProvider):
4545

4646
pretrained_model_name: str = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
4747

48-
# Vision-specific token IDs matching Qwen3VL MoE configuration
48+
# Vision-specific token IDs matching Qwen3-Omni-MoE configuration
4949
# Based on HuggingFace Qwen3-Omni-MoE configs
5050
# Token ID for image placeholder in text
5151
image_token_id: int = 151655
@@ -143,7 +143,7 @@ def finalize(self) -> None:
143143

144144
def provide(self, pre_process=None, post_process=None, vp_stage=None):
145145
"""
146-
Provide a Qwen3VL MoE model instance with vision and language components.
146+
Provide a Qwen3 Omni MoE model instance with vision and language components.
147147
"""
148148
language_transformer_config = self
149149

src/megatron/bridge/models/qwen_omni/thinker_model.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# http://www.apache.org/licenses/LICENSE-2.0
88
#
99
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1012
# See the License for the specific language governing permissions and
1113
# limitations under the License.
1214

@@ -139,7 +141,7 @@ def __init__(
139141
pg_collection=pg_collection,
140142
)
141143
assert len(vision_transformer_config.vision_config.deepstack_visual_indexes) < len(self.language_model.decoder.layers), (
142-
"the deepstack_visual_embeds should on the first pp-stage",
144+
f"the deepstack_visual_embeds should on the first pp-stage",
143145
f"got {len(vision_transformer_config.vision_config.deepstack_visual_indexes)} deepstack_visual_indexes, "
144146
f" {len(self.language_model.decoder.layers)} language model layers",
145147
)
@@ -208,7 +210,8 @@ def get_audio_features(
208210
else:
209211
audio_feature_lengths = None
210212

211-
feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
213+
# feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
214+
feature_lens = audio_feature_lengths
212215
audio_outputs = self.audio_model(
213216
input_features,
214217
feature_lens=feature_lens,

src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def qwen3_omni_30b_a3b_finetune_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs
248248
is_full_sft = peft_value is None or (isinstance(peft_value, str) and peft_value.lower() == "none")
249249

250250
recommended_kwargs: Qwen3VLCommonKwargs = {
251-
"hf_path": "../hf-hub/Qwen/Qwen3-Omni-30B-A3B-Instruct",
251+
"hf_path": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
252252
"tensor_model_parallel_size": 1,
253253
"pipeline_model_parallel_size": 1,
254254
"pipeline_dtype": torch.bfloat16,

0 commit comments

Comments (0)