8 changes: 5 additions & 3 deletions configs/multimodal/qwen3_vl/qwen3_vl_moe.yaml
@@ -4,11 +4,13 @@ model:
   attn_implementation: flash_attention_2
 
 data:
-  train_path: sharegpt4v_pretrain
+  train_path: /path/to/ShareGPT4V-small-coco-128.jsonl
   data_type: conversation
   source_name: sharegpt4v_sft
   chat_template: qwen2_5vl
-  train_size: 80000000
+  max_seq_len: 4096
+  dataloader_type: native
+  datasets_type: iterable # or "mapping" depending on your dataset size
 
 train:
   output_dir: qwen3_vl_moe_sft
@@ -25,7 +27,7 @@ train:
   lr_decay_style: cosine
   num_train_epochs: 2
   micro_batch_size: 1
-  global_batch_size: 32
+  global_batch_size: 16
   max_steps: 500
   init_device: meta
   enable_profiling: true
74 changes: 74 additions & 0 deletions docs/examples/qwen3vl_moe.md
@@ -0,0 +1,74 @@
# Qwen3-VL MoE Training Guide

## 1. Download Qwen3-VL MoE Model

```shell
python3 scripts/download_hf_model.py \
    --repo_id Qwen/Qwen3-VL-30B-A3B-Instruct \
    --local_dir .
```

## 2. Merge Qwen3-VL MoE Model Experts (Optional)

If you want to use the GroupGemm optimization for MoE experts, merge the expert weights first:

```shell
python3 scripts/moe_ckpt_merge/moe_merge.py \
    --raw_hf_path Qwen3-VL-30B-A3B-Instruct \
    --merge_hf_path Qwen3-VL-30B-A3B-Instruct-merge
```

Then update the `model_path` in your config to use the merged model.
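For example:

```yaml
model:
  model_path: Qwen3-VL-30B-A3B-Instruct-merge
```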

## 3. Prepare Dataset

Download the [ShareGPT4V-small](https://github.com/iqiancheng/ShareGPT4V-small) dataset.

Image paths in the dataset may be relative; they are resolved against the directory containing `train_path`. For example, if `train_path` is `/path/to/ShareGPT4V-small-coco-128.jsonl` and the dataset references `coco/train2017/image.jpg`, the image is automatically resolved to `/path/to/coco/train2017/image.jpg`.
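Resolution is handled by the new `resolve_relative_path` helper in `veomni/data/multimodal/file_utils.py`; a quick sketch of its behavior (assuming the jsonl file exists on disk):

```python
from veomni.data.multimodal.file_utils import resolve_relative_path

# train_path points at an existing file, so its parent directory becomes
# the base for relative paths:
resolve_relative_path(
    "coco/train2017/image.jpg",
    train_path="/path/to/ShareGPT4V-small-coco-128.jsonl",
)
# -> "/path/to/coco/train2017/image.jpg"

# Absolute paths and URLs pass through unchanged:
resolve_relative_path("/data/img.jpg", train_path="/path/to/data.jsonl")
# -> "/data/img.jpg"
resolve_relative_path("https://example.com/img.jpg")
# -> "https://example.com/img.jpg"
```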

## 4. Configure Training

Update the config file `configs/multimodal/qwen3_vl/qwen3_vl_moe.yaml`:

```yaml
model:
  model_path: Qwen3-VL-30B-A3B-Instruct # or Qwen3-VL-30B-A3B-Instruct-merge if merged
  moe_implementation: fused
  attn_implementation: flash_attention_2

data:
  train_path: /path/to/ShareGPT4V-small-coco-128.jsonl
  data_type: conversation
  source_name: sharegpt4v_sft
  chat_template: qwen2_5vl
  max_seq_len: 4096
  dataloader_type: native
  datasets_type: iterable

train:
  output_dir: qwen3_vl_moe_sft
  data_parallel_mode: fsdp2
  enable_reentrant: false
  use_wandb: true
  wandb_project: qwen3_vl_moe
  wandb_name: qwen3_vl_moe
  rmpad: false
  rmpad_with_pos_ids: true
  expert_parallel_size: 1
  freeze_vit: false
  lr: 1.0e-5
  lr_decay_style: cosine
  num_train_epochs: 2
  micro_batch_size: 1
  global_batch_size: 16
  max_steps: 500
  init_device: meta
  ckpt_manager: dcp
  save_hf_weights: false
```
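As a quick sanity check on the batch settings (assuming the common convention `global_batch_size = micro_batch_size × grad_accum × data_parallel_size`; VeOmni's exact accounting may differ):

```python
# Hypothetical single-node, 8-GPU run; adjust world_size to your cluster.
world_size = 8
micro_batch_size = 1
global_batch_size = 16

grad_accum = global_batch_size // (micro_batch_size * world_size)
print(grad_accum)  # -> 2 micro-batches accumulated per rank per optimizer step
```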

## 5. Train Qwen3-VL MoE Model

```shell
bash train.sh tasks/omni/train_qwen_vl.py configs/multimodal/qwen3_vl/qwen3_vl_moe.yaml
```
4 changes: 3 additions & 1 deletion tasks/omni/train_qwen_vl.py
@@ -91,7 +91,7 @@ class MyDataArguments(DataArguments):
 @dataclass
 class Arguments:
     model: "ModelArguments" = field(default_factory=ModelArguments)
-    data: "DataArguments" = field(default_factory=DataArguments)
+    data: "MyDataArguments" = field(default_factory=MyDataArguments)
     train: "MyTrainingArguments" = field(default_factory=MyTrainingArguments)
 
 
@@ -146,6 +146,7 @@ def main():
             processor=processor,
             chat_template=chat_template,
             position_id_func=position_id_func,
+            train_path=args.data.train_path,
             **args.data.mm_configs,
         )
     elif model_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
@@ -154,6 +155,7 @@
             processor=processor,
             chat_template=chat_template,
             position_id_func=position_id_func,
+            train_path=args.data.train_path,
             **args.data.mm_configs,
         )
     else:
5 changes: 4 additions & 1 deletion veomni/data/multimodal/audio_utils.py
@@ -7,6 +7,8 @@
 import numpy as np
 import soundfile as sf
 
+from .file_utils import resolve_relative_path
+
 
 AudioInput = Union[
     np.ndarray,
@@ -59,12 +61,13 @@ def load_audio_from_path(audio_path: str, sample_rate: int = 16000, **kwargs):
     if audio_path.startswith("http://") or audio_path.startswith("https://"):
         return librosa.load(audioread.ffdec.FFmpegAudioFile(audio_path), sr=sample_rate)[0]
     else:
+        audio_path = resolve_relative_path(audio_path, kwargs.get("train_path"))
         return librosa.load(audio_path, sr=sample_rate)[0]
 
 
 def load_audio(audios: AudioInput, **kwargs):
     if isinstance(audios, str):
-        return load_audio_from_path(audios)
+        return load_audio_from_path(audios, **kwargs)
     elif isinstance(audios, bytes):
         return load_audio_from_bytes(audios, **kwargs)
     else:
32 changes: 32 additions & 0 deletions veomni/data/multimodal/file_utils.py
@@ -0,0 +1,32 @@
import os
from typing import Optional


def resolve_relative_path(file_path: str, train_path: Optional[str] = None) -> str:
    """
    Resolve a relative file path against the directory containing train_path.

    Args:
        file_path: File path (can be absolute, relative, or a URL).
        train_path: Path to the training data file or directory. If None, the
            original path is returned.

    Returns:
        The resolved path if file_path is relative and train_path is provided,
        otherwise the original file_path.
    """
    # Skip resolution for URLs or absolute paths
    if file_path.startswith(("http://", "https://")) or os.path.isabs(file_path):
        return file_path

    # Resolve the relative path against the train_path directory
    if train_path:
        # train_path may be a file or a directory; use its containing directory
        if os.path.isfile(train_path):
            train_dir = os.path.dirname(train_path)
        else:
            train_dir = train_path
        return os.path.join(train_dir, file_path)

    return file_path

5 changes: 4 additions & 1 deletion veomni/data/multimodal/image_utils.py
@@ -7,6 +7,8 @@
 import requests
 from PIL import Image
 
+from .file_utils import resolve_relative_path
+
 
 ImageInput = Union[
     Image.Image,
@@ -75,6 +77,7 @@ def load_image_from_path(image: str, **kwargs):
         response = requests.get(image, stream=True)
         image_obj = Image.open(BytesIO(response.content))
     else:
+        image = resolve_relative_path(image, kwargs.get("train_path"))
         image_obj = Image.open(image)
     return image_obj.convert("RGB")
 
@@ -93,7 +96,7 @@ def load_image(image: ImageInput, **kwargs):
 
 
 def fetch_images(images: List[ImageInput], **kwargs):
-    images = [load_image(image) for image in images]
+    images = [load_image(image, **kwargs) for image in images]
     max_image_nums = kwargs.get("max_image_nums", len(images))
     images = images[:max_image_nums]
     images = [smart_resize(image, **kwargs) for image in images]
3 changes: 3 additions & 0 deletions veomni/data/multimodal/video_utils.py
@@ -12,6 +12,7 @@
 from torchvision.transforms import InterpolationMode, functional
 
 from ...utils import logging
+from .file_utils import resolve_relative_path
 
 
 logger = logging.get_logger(__name__)
@@ -138,6 +139,8 @@ def load_video_from_path(video: str, use_audio_in_video: bool = True, **kwargs):
            logger.warning_once(
                "torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0."
            )
+    else:
+        video = resolve_relative_path(video, kwargs.get("train_path"))
     video, _audio, info = torchvision.io.read_video(
         video,
         0.0,
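Taken together, `train_path` flows from the task script through `**kwargs` into each modality loader. A minimal sketch of the image path (hypothetical file names; the jsonl and image are assumed to exist on disk):

```python
from veomni.data.multimodal.image_utils import fetch_images

# fetch_images forwards **kwargs to load_image -> load_image_from_path,
# which calls resolve_relative_path(image, kwargs.get("train_path")).
images = fetch_images(
    ["coco/train2017/image.jpg"],  # relative path as stored in the dataset
    train_path="/path/to/ShareGPT4V-small-coco-128.jsonl",
)
```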