|
| 1 | +OMP_NUM_THREADS=14 \ |
| 2 | +MAX_PIXELS=1003520 \ |
| 3 | +VIDEO_MAX_PIXELS=50176 \ |
| 4 | +FPS_MAX_FRAMES=12 \ |
| 5 | +swift export \ |
| 6 | + --model Qwen/Qwen2.5-Omni-7B \ |
| 7 | + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \ |
| 8 | + 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \ |
| 9 | + 'speech_asr/speech_asr_aishell1_trainsets:validation#5000' \ |
| 10 | + --max_length 4096 \ |
| 11 | + --split_dataset_ratio 0.01 \ |
| 12 | + --dataset_num_proc 16 \ |
| 13 | + --to_cached_dataset true \ |
| 14 | + --lazy_tokenize false \ |
| 15 | + --output_dir ./qwen2_5_omni_cached_dataset |
| 16 | + |
| 17 | +# Training on 4 GPUs, roughly 70GiB of GPU memory each |
| 18 | +PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \ |
| 19 | +MAX_PIXELS=1003520 \ |
| 20 | +VIDEO_MAX_PIXELS=50176 \ |
| 21 | +FPS_MAX_FRAMES=12 \ |
| 22 | +NPROC_PER_NODE=4 \ |
| 23 | +ENABLE_AUDIO_OUTPUT=0 \ |
| 24 | +CUDA_VISIBLE_DEVICES=0,1,2,3 \ |
| 25 | +swift sft \ |
| 26 | + --model Qwen/Qwen2.5-Omni-7B \ |
| 27 | + --train_type full \ |
| 28 | + --cached_dataset './qwen2_5_omni_cached_dataset' \ |
| 29 | + --num_train_epochs 1 \ |
| 30 | + --split_dataset_ratio 0.01 \ |
| 31 | + --torch_dtype bfloat16 \ |
| 32 | + --per_device_train_batch_size 1 \ |
| 33 | + --per_device_eval_batch_size 1 \ |
| 34 | + --learning_rate 1e-5 \ |
| 35 | + --gradient_accumulation_steps 1 \ |
| 36 | + --packing true \ |
| 37 | + --freeze_llm false \ |
| 38 | + --freeze_vit true \ |
| 39 | + --freeze_aligner true \ |
| 40 | + --eval_steps 200 \ |
| 41 | + --save_steps 200 \ |
| 42 | + --logging_steps 5 \ |
| 43 | + --max_length 4096 \ |
| 44 | + --warmup_ratio 0.05 \ |
| 45 | + --dataloader_num_workers 8 \ |
| 46 | + --dataset_num_proc 8 \ |
| 47 | + --save_total_limit 2 \ |
| 48 | + --save_only_model true \ |
| 49 | + --output_dir output/Qwen2.5-Omni-7B \ |
| 50 | + --deepspeed zero2 \ |
| 51 | + --use_liger_kernel true \ |
| 52 | + --attn_impl flash_attn |
| 53 | + |
| 54 | +# Run inference on the validation set (same --dataset list and --split_dataset_ratio as above) |
| 55 | +CUDA_VISIBLE_DEVICES=0 \ |
| 56 | +MAX_PIXELS=1003520 \ |
| 57 | +VIDEO_MAX_PIXELS=50176 \ |
| 58 | +FPS_MAX_FRAMES=12 \ |
| 59 | +ENABLE_AUDIO_OUTPUT=0 \ |
| 60 | +swift infer \ |
| 61 | + --model output/Qwen2.5-Omni-7B/vx-xxx/checkpoint-xxx \ |
| 62 | + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \ |
| 63 | + 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \ |
| 64 | + 'speech_asr/speech_asr_aishell1_trainsets:validation#5000' \ |
| 65 | + --max_length 4096 \ |
| 66 | + --split_dataset_ratio 0.01 \ |
| 67 | + --attn_impl flash_attn \ |
| 68 | + --stream true \ |
| 69 | + --temperature 0 \ |
| 70 | + --max_new_tokens 512 |
0 commit comments