8 changes: 5 additions & 3 deletions configs/multimodal/qwen3_vl/qwen3_vl_moe.yaml
@@ -4,11 +4,13 @@ model:
   attn_implementation: flash_attention_2
 
 data:
-  train_path: sharegpt4v_pretrain
+  train_path: /path/to/ShareGPT4V-small-coco-128.jsonl
   data_type: conversation
   source_name: sharegpt4v_sft
   chat_template: qwen2_5vl
-  train_size: 80000000
+  max_seq_len: 4096
+  dataloader_type: native
+  datasets_type: iterable # or "mapping" depending on your dataset size
 
 train:
   output_dir: qwen3_vl_moe_sft
@@ -25,7 +27,7 @@ train:
   lr_decay_style: cosine
   num_train_epochs: 2
   micro_batch_size: 1
-  global_batch_size: 32
+  global_batch_size: 16
   max_steps: 500
   init_device: meta
   enable_profiling: true
74 changes: 74 additions & 0 deletions docs/examples/qwen3vl_moe.md
@@ -0,0 +1,74 @@
# Qwen3-VL MoE Training Guide

## 1. Download Qwen3-VL MoE Model

```shell
python3 scripts/download_hf_model.py \
    --repo_id Qwen/Qwen3-VL-30B-A3B-Instruct \
    --local_dir .
```

## 2. Merge Qwen3-VL MoE Model Experts (Optional)

If you want to use the GroupGemm optimization for MoE experts, merge the expert weights first:

```shell
python3 scripts/moe_ckpt_merge/moe_merge.py \
    --raw_hf_path Qwen3-VL-30B-A3B-Instruct \
    --merge_hf_path Qwen3-VL-30B-A3B-Instruct-merge
```

Then update the `model_path` in your config to use the merged model.
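For example:

```yaml
model:
  model_path: Qwen3-VL-30B-A3B-Instruct-merge
```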

## 3. Prepare Dataset

Download the [ShareGPT4V-small](https://github.com/iqiancheng/ShareGPT4V-small) dataset.

Image paths in the dataset may be relative; they are resolved against the directory containing `train_path`. For example, if `train_path` is `/path/to/ShareGPT4V-small-coco-128.jsonl` and the dataset references `coco/train2017/image.jpg`, the image is automatically resolved to `/path/to/coco/train2017/image.jpg`.
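Resolution is handled by the new `resolve_relative_path` helper in `veomni/data/multimodal/file_utils.py`; a quick sketch of its behavior (assuming the jsonl file exists on disk):

```python
from veomni.data.multimodal.file_utils import resolve_relative_path

# train_path points at an existing file, so its parent directory becomes
# the base for relative paths:
resolve_relative_path(
    "coco/train2017/image.jpg",
    train_path="/path/to/ShareGPT4V-small-coco-128.jsonl",
)
# -> "/path/to/coco/train2017/image.jpg"

# Absolute paths and URLs pass through unchanged:
resolve_relative_path("/data/img.jpg", train_path="/path/to/data.jsonl")
# -> "/data/img.jpg"
resolve_relative_path("https://example.com/img.jpg")
# -> "https://example.com/img.jpg"
```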

## 4. Configure Training

Update the config file `configs/multimodal/qwen3_vl/qwen3_vl_moe.yaml`:

```yaml
model:
  model_path: Qwen3-VL-30B-A3B-Instruct # or Qwen3-VL-30B-A3B-Instruct-merge if merged
  moe_implementation: fused
  attn_implementation: flash_attention_2

data:
  train_path: /path/to/ShareGPT4V-small-coco-128.jsonl
  data_type: conversation
  source_name: sharegpt4v_sft
  chat_template: qwen2_5vl
  max_seq_len: 4096
  dataloader_type: native
  datasets_type: iterable

train:
  output_dir: qwen3_vl_moe_sft
  data_parallel_mode: fsdp2
  enable_reentrant: false
  use_wandb: true
  wandb_project: qwen3_vl_moe
  wandb_name: qwen3_vl_moe
  rmpad: false
  rmpad_with_pos_ids: true
  expert_parallel_size: 1
  freeze_vit: false
  lr: 1.0e-5
  lr_decay_style: cosine
  num_train_epochs: 2
  micro_batch_size: 1
  global_batch_size: 16
  max_steps: 500
  init_device: meta
  ckpt_manager: dcp
  save_hf_weights: false
```
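As a quick sanity check on the batch settings (assuming the common convention `global_batch_size = micro_batch_size × grad_accum × data_parallel_size`; VeOmni's exact accounting may differ):

```python
# Hypothetical single-node, 8-GPU run; adjust world_size to your cluster.
world_size = 8
micro_batch_size = 1
global_batch_size = 16

grad_accum = global_batch_size // (micro_batch_size * world_size)
print(grad_accum)  # -> 2 micro-batches accumulated per rank per optimizer step
```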

## 5. Train Qwen3-VL MoE Model

```shell
bash train.sh tasks/omni/train_qwen_vl.py configs/multimodal/qwen3_vl/qwen3_vl_moe.yaml
```
4 changes: 3 additions & 1 deletion tasks/omni/train_qwen_vl.py
@@ -91,7 +91,7 @@ class MyDataArguments(DataArguments):
 @dataclass
 class Arguments:
     model: "ModelArguments" = field(default_factory=ModelArguments)
-    data: "DataArguments" = field(default_factory=DataArguments)
+    data: "MyDataArguments" = field(default_factory=MyDataArguments)
     train: "MyTrainingArguments" = field(default_factory=MyTrainingArguments)
 
 
@@ -146,6 +146,7 @@ def main():
             processor=processor,
             chat_template=chat_template,
             position_id_func=position_id_func,
+            train_path=args.data.train_path,
             **args.data.mm_configs,
         )
     elif model_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
@@ -154,6 +155,7 @@
             processor=processor,
             chat_template=chat_template,
             position_id_func=position_id_func,
+            train_path=args.data.train_path,
             **args.data.mm_configs,
         )
     else:
5 changes: 4 additions & 1 deletion veomni/data/multimodal/audio_utils.py
@@ -7,6 +7,8 @@
 import numpy as np
 import soundfile as sf
 
+from .file_utils import resolve_relative_path
+
 
 AudioInput = Union[
     np.ndarray,
@@ -59,12 +61,13 @@ def load_audio_from_path(audio_path: str, sample_rate: int = 16000, **kwargs):
     if audio_path.startswith("http://") or audio_path.startswith("https://"):
         return librosa.load(audioread.ffdec.FFmpegAudioFile(audio_path), sr=sample_rate)[0]
     else:
+        audio_path = resolve_relative_path(audio_path, kwargs.get("train_path"))
         return librosa.load(audio_path, sr=sample_rate)[0]
 
 
 def load_audio(audios: AudioInput, **kwargs):
     if isinstance(audios, str):
-        return load_audio_from_path(audios)
+        return load_audio_from_path(audios, **kwargs)
     elif isinstance(audios, bytes):
         return load_audio_from_bytes(audios, **kwargs)
     else:
32 changes: 32 additions & 0 deletions veomni/data/multimodal/file_utils.py
@@ -0,0 +1,32 @@
import os
from typing import Optional


def resolve_relative_path(file_path: str, train_path: Optional[str] = None) -> str:
    """
    Resolve a relative file path against the directory containing train_path.

    Args:
        file_path: File path (can be absolute, relative, or a URL).
        train_path: Path to the training data file or directory. If None, the
            original path is returned.

    Returns:
        The resolved path if file_path is relative and train_path is provided,
        otherwise the original file_path.
    """
    # Skip resolution for URLs or absolute paths
    if file_path.startswith(("http://", "https://")) or os.path.isabs(file_path):
        return file_path

    # Resolve the relative path against the train_path directory
    if train_path:
        # train_path may be a file or a directory; use its containing directory
        if os.path.isfile(train_path):
            train_dir = os.path.dirname(train_path)
        else:
            train_dir = train_path
        return os.path.join(train_dir, file_path)

    return file_path

5 changes: 4 additions & 1 deletion veomni/data/multimodal/image_utils.py
@@ -7,6 +7,8 @@
 import requests
 from PIL import Image
 
+from .file_utils import resolve_relative_path
+
 
 ImageInput = Union[
     Image.Image,
@@ -75,6 +77,7 @@ def load_image_from_path(image: str, **kwargs):
         response = requests.get(image, stream=True)
         image_obj = Image.open(BytesIO(response.content))
     else:
+        image = resolve_relative_path(image, kwargs.get("train_path"))
         image_obj = Image.open(image)
     return image_obj.convert("RGB")
 
@@ -93,7 +96,7 @@ def load_image(image: ImageInput, **kwargs):
 
 
 def fetch_images(images: List[ImageInput], **kwargs):
-    images = [load_image(image) for image in images]
+    images = [load_image(image, **kwargs) for image in images]
     max_image_nums = kwargs.get("max_image_nums", len(images))
     images = images[:max_image_nums]
     images = [smart_resize(image, **kwargs) for image in images]
3 changes: 3 additions & 0 deletions veomni/data/multimodal/video_utils.py
@@ -12,6 +12,7 @@
 from torchvision.transforms import InterpolationMode, functional
 
 from ...utils import logging
+from .file_utils import resolve_relative_path
 
 
 logger = logging.get_logger(__name__)
@@ -138,6 +139,8 @@ def load_video_from_path(video: str, use_audio_in_video: bool = True, **kwargs):
            logger.warning_once(
                "torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0."
            )
+    else:
+        video = resolve_relative_path(video, kwargs.get("train_path"))
     video, _audio, info = torchvision.io.read_video(
         video,
         0.0,
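Taken together, `train_path` flows from the task script through `**kwargs` into each modality loader. A minimal sketch of the image path (hypothetical file names; the jsonl and image are assumed to exist on disk):

```python
from veomni.data.multimodal.image_utils import fetch_images

# fetch_images forwards **kwargs to load_image -> load_image_from_path,
# which calls resolve_relative_path(image, kwargs.get("train_path")).
images = fetch_images(
    ["coco/train2017/image.jpg"],  # relative path as stored in the dataset
    train_path="/path/to/ShareGPT4V-small-coco-128.jsonl",
)
```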