diff --git a/diffsynth/core/device/__init__.py b/diffsynth/core/device/__init__.py
index e53364f4c..8373471cf 100644
--- a/diffsynth/core/device/__init__.py
+++ b/diffsynth/core/device/__init__.py
@@ -1 +1,2 @@
-from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type
\ No newline at end of file
+from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type, get_device_name
+from .npu_compatible_device import IS_NPU_AVAILABLE
diff --git a/diffsynth/core/vram/layers.py b/diffsynth/core/vram/layers.py
index 751792d07..0f99b0d16 100644
--- a/diffsynth/core/vram/layers.py
+++ b/diffsynth/core/vram/layers.py
@@ -2,7 +2,7 @@ from typing import Union
 
 from .initialization import skip_model_initialization
 from .disk_map import DiskMap
-from ..device import parse_device_type
+from ..device import parse_device_type, get_device_name, IS_NPU_AVAILABLE
 
 
 class AutoTorchModule(torch.nn.Module):
@@ -63,7 +63,7 @@ def cast_to(self, weight, dtype, device):
         return r
 
     def check_free_vram(self):
-        device = self.computation_device if self.computation_device != "npu" else "npu:0"
+        device = self.computation_device if not IS_NPU_AVAILABLE else get_device_name()
         gpu_mem_state = getattr(torch, self.computation_device_type).mem_get_info(device)
         used_memory = (gpu_mem_state[1] - gpu_mem_state[0]) / (1024**3)
         return used_memory < self.vram_limit
diff --git a/diffsynth/diffusion/base_pipeline.py b/diffsynth/diffusion/base_pipeline.py
index fa355a163..b4e79c0d8 100644
--- a/diffsynth/diffusion/base_pipeline.py
+++ b/diffsynth/diffusion/base_pipeline.py
@@ -7,6 +7,7 @@ from ..utils.lora import GeneralLoRALoader
 from ..models.model_loader import ModelPool
 from ..utils.controlnet import ControlNetInput
+from ..core.device import get_device_name, IS_NPU_AVAILABLE
 
 
 class PipelineUnit:
@@ -177,7 +178,7 @@ def generate_noise(self, shape, seed=None, rand_device="cpu", rand_torch_dtype=t
 
     def get_vram(self):
-        device = self.device if self.device != "npu" else "npu:0"
+        device = self.device if not IS_NPU_AVAILABLE else get_device_name()
         return getattr(torch, self.device_type).mem_get_info(device)[1] / (1024 ** 3)
 
     def get_module(self, model, name):
diff --git a/docs/en/Pipeline_Usage/GPU_support.md b/docs/en/Pipeline_Usage/GPU_support.md
index 789d26a47..6c27de778 100644
--- a/docs/en/Pipeline_Usage/GPU_support.md
+++ b/docs/en/Pipeline_Usage/GPU_support.md
@@ -13,7 +13,7 @@ All sample code provided by this project supports NVIDIA GPUs by default, requir
 AMD provides PyTorch packages based on ROCm, so most models can run without code changes. A small number of models may not be compatible due to their reliance on CUDA-specific instructions.
 
 ## Ascend NPU
-
+### Inference
 When using Ascend NPU, you need to replace `"cuda"` with `"npu"` in your code.
 
 For example, here is the inference code for **Wan2.1-T2V-1.3B**, modified for Ascend NPU:
@@ -22,6 +22,7 @@
 import torch
 from diffsynth.utils.data import save_video, VideoData
 from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
+from diffsynth.core.device.npu_compatible_device import get_device_name
 
 vram_config = {
     "offload_dtype": "disk",
@@ -46,7 +47,7 @@ pipe = WanVideoPipeline.from_pretrained(
     ],
     tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
 -   vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
-+   vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
++   vram_limit=torch.npu.mem_get_info(get_device_name())[1] / (1024 ** 3) - 2,
 )
 
 video = pipe(
@@ -56,3 +57,28 @@ video = pipe(
 )
 save_video(video, "video.mp4", fps=15, quality=5)
 ```
+
+### Training
+Sample NPU launch scripts have been added for each type of model. They are stored under `examples/xxx/model_training/special/npu_training`, for example `examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh`.
+
+The NPU training scripts set NPU-specific environment variables that improve performance and enable the parameters required by certain models.
+
+#### Environment variables
+```shell
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+```
+`expandable_segments:True`: enables the memory pool's expandable segments (virtual memory) feature, which helps reduce memory fragmentation.
+
+```shell
+export CPU_AFFINITY_CONF=1
+```
+`0` or unset: CPU core binding is disabled
+
+`1`: enables coarse-grained core binding
+
+`2`: enables fine-grained core binding
+
+#### Parameters for specific models
+| Model | Parameter | Note |
+|----------------|---------------------------|-------------------|
+| Wan 14B series | --initialize_model_on_cpu | The 14B model needs to be initialized on the CPU |
\ No newline at end of file
diff --git a/docs/zh/Pipeline_Usage/GPU_support.md b/docs/zh/Pipeline_Usage/GPU_support.md
index 56d78f785..b955f5600 100644
--- a/docs/zh/Pipeline_Usage/GPU_support.md
+++ b/docs/zh/Pipeline_Usage/GPU_support.md
@@ -13,7 +13,7 @@
 AMD 提供了基于 ROCm 的 torch 包,所以大多数模型无需修改代码即可运行,少数模型由于依赖特定的 cuda 指令无法运行。
 
 ## Ascend NPU
-
+### 推理
 使用 Ascend NPU 时,需把代码中的 `"cuda"` 改为 `"npu"`。
 
 例如,Wan2.1-T2V-1.3B 的推理代码:
@@ -22,6 +22,7 @@ AMD 提供了基于 ROCm 的 torch 包,所以大多数模型无需修改代码
 import torch
 from diffsynth.utils.data import save_video, VideoData
 from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
+from diffsynth.core.device.npu_compatible_device import get_device_name
 
 vram_config = {
     "offload_dtype": "disk",
@@ -33,7 +34,7 @@ vram_config = {
 +   "preparing_device": "npu",
     "computation_dtype": torch.bfloat16,
 -   "computation_device": "cuda",
-+   "preparing_device": "npu",
++   "computation_device": "npu",
 }
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
@@ -46,7 +47,7 @@ pipe = WanVideoPipeline.from_pretrained(
     ],
     tokenizer_config=ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/umt5-xxl/"),
 -   vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 2,
-+   vram_limit=torch.npu.mem_get_info("npu:0")[1] / (1024 ** 3) - 2,
++   vram_limit=torch.npu.mem_get_info(get_device_name())[1] / (1024 ** 3) - 2,
 )
 
 video = pipe(
@@ -56,3 +57,28 @@ video = pipe(
 )
 save_video(video, "video.mp4", fps=15, quality=5)
 ```
+
+### 训练
+当前已为每类模型添加NPU的启动脚本样例,脚本存放在`examples/xxx/model_training/special/npu_training`目录下,例如 `examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh`。
+
+在NPU训练脚本中,添加了可以优化性能的NPU特有环境变量,并针对特定模型开启了相关参数。
+
+#### 环境变量
+```shell
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+```
+`expandable_segments:True`: 使能内存池扩展段功能,即虚拟内存特性,有助于减少内存碎片。
+
+```shell
+export CPU_AFFINITY_CONF=1
+```
+设置为0或未设置: 表示不启用绑核功能
+
+1: 表示开启粗粒度绑核
+
+2: 表示开启细粒度绑核
+
+#### 特定模型需要开启的参数
+| 模型 | 参数 | 备注 |
+|-----------|------|-------------------|
+| Wan 14B系列 | --initialize_model_on_cpu | 14B模型需要在CPU上进行初始化 |
\ No newline at end of file
diff --git a/examples/flux/model_training/special/npu_training/FLUX.1-Kontext-dev-NPU.sh b/examples/flux/model_training/special/npu_training/FLUX.1-Kontext-dev-NPU.sh
new file mode 100644
index 000000000..7ec976d35
--- /dev/null
+++ b/examples/flux/model_training/special/npu_training/FLUX.1-Kontext-dev-NPU.sh
@@ -0,0 +1,17 @@
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/flux/model_training/full/accelerate_config_zero2offload.yaml examples/flux/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata_kontext.csv \
+  --data_file_keys "image,kontext_images" \
+  --max_pixels 1048576 \
+  --dataset_repeat 400 \
+  --model_id_with_origin_paths "black-forest-labs/FLUX.1-Kontext-dev:flux1-kontext-dev.safetensors,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/*.safetensors,black-forest-labs/FLUX.1-dev:ae.safetensors" \
+  --learning_rate 1e-5 \
+  --num_epochs 1 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/FLUX.1-Kontext-dev_full" \
+  --trainable_models "dit" \
+  --extra_inputs "kontext_images" \
+  --use_gradient_checkpointing
diff --git a/examples/flux/model_training/special/npu_training/FLUX.1-dev-NPU.sh b/examples/flux/model_training/special/npu_training/FLUX.1-dev-NPU.sh
new file mode 100644
index 000000000..813359448
--- /dev/null
+++ b/examples/flux/model_training/special/npu_training/FLUX.1-dev-NPU.sh
@@ -0,0 +1,15 @@
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/flux/model_training/full/accelerate_config_zero2offload.yaml examples/flux/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata.csv \
+  --max_pixels 1048576 \
+  --dataset_repeat 400 \
+  --model_id_with_origin_paths "black-forest-labs/FLUX.1-dev:flux1-dev.safetensors,black-forest-labs/FLUX.1-dev:text_encoder/model.safetensors,black-forest-labs/FLUX.1-dev:text_encoder_2/*.safetensors,black-forest-labs/FLUX.1-dev:ae.safetensors" \
+  --learning_rate 1e-5 \
+  --num_epochs 1 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/FLUX.1-dev_full" \
+  --trainable_models "dit" \
+  --use_gradient_checkpointing
diff --git a/examples/qwen_image/model_training/special/npu_training/Qwen-Image-Edit-2509-LoRA-NPU.sh b/examples/qwen_image/model_training/special/npu_training/Qwen-Image-Edit-2509-LoRA-NPU.sh
new file mode 100644
index 000000000..9c3f02c92
--- /dev/null
+++ b/examples/qwen_image/model_training/special/npu_training/Qwen-Image-Edit-2509-LoRA-NPU.sh
@@ -0,0 +1,38 @@
+# Due to memory limitations, split training is required to train the model on NPU
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch examples/qwen_image/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata.csv \
+  --max_pixels 1048576 \
+  --dataset_repeat 1 \
+  --model_id_with_origin_paths "Qwen/Qwen-Image-Edit-2509:text_encoder/model*.safetensors,Qwen/Qwen-Image-Edit-2509:vae/diffusion_pytorch_model.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Qwen-Image-Edit-2509-LoRA-splited-cache" \
+  --lora_base_model "dit" \
+  --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \
+  --lora_rank 32 \
+  --use_gradient_checkpointing \
+  --dataset_num_workers 8 \
+  --find_unused_parameters \
+  --task "sft:data_process"
+
+accelerate launch examples/qwen_image/model_training/train.py \
+  --dataset_base_path "./models/train/Qwen-Image-Edit-2509-LoRA-splited-cache" \
+  --max_pixels 1048576 \
+  --dataset_repeat 50 \
+  --model_id_with_origin_paths "Qwen/Qwen-Image-Edit-2509:transformer/diffusion_pytorch_model*.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Qwen-Image-Edit-2509-LoRA-splited" \
+  --lora_base_model "dit" \
+  --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \
+  --lora_rank 32 \
+  --use_gradient_checkpointing \
+  --dataset_num_workers 8 \
+  --find_unused_parameters \
+  --task "sft:train"
diff --git a/examples/qwen_image/model_training/special/npu_training/Qwen-Image-LoRA-NPU.sh b/examples/qwen_image/model_training/special/npu_training/Qwen-Image-LoRA-NPU.sh
new file mode 100644
index 000000000..08978c074
--- /dev/null
+++ b/examples/qwen_image/model_training/special/npu_training/Qwen-Image-LoRA-NPU.sh
@@ -0,0 +1,38 @@
+# Due to memory limitations, split training is required to train the model on NPU
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch examples/qwen_image/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata.csv \
+  --max_pixels 1048576 \
+  --dataset_repeat 1 \
+  --model_id_with_origin_paths "Qwen/Qwen-Image:text_encoder/model*.safetensors,Qwen/Qwen-Image:vae/diffusion_pytorch_model.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Qwen-Image-LoRA-splited-cache" \
+  --lora_base_model "dit" \
+  --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \
+  --lora_rank 32 \
+  --use_gradient_checkpointing \
+  --dataset_num_workers 8 \
+  --find_unused_parameters \
+  --task "sft:data_process"
+
+accelerate launch examples/qwen_image/model_training/train.py \
+  --dataset_base_path "./models/train/Qwen-Image-LoRA-splited-cache" \
+  --max_pixels 1048576 \
+  --dataset_repeat 50 \
+  --model_id_with_origin_paths "Qwen/Qwen-Image:transformer/diffusion_pytorch_model*.safetensors" \
+  --learning_rate 1e-4 \
+  --num_epochs 5 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Qwen-Image-LoRA-splited" \
+  --lora_base_model "dit" \
+  --lora_target_modules "to_q,to_k,to_v,add_q_proj,add_k_proj,add_v_proj,to_out.0,to_add_out,img_mlp.net.2,img_mod.1,txt_mlp.net.2,txt_mod.1" \
+  --lora_rank 32 \
+  --use_gradient_checkpointing \
+  --dataset_num_workers 8 \
+  --find_unused_parameters \
+  --task "sft:train"
diff --git a/examples/wanvideo/model_training/special/npu_training/Wan2.1-T2V-14B-NPU.sh b/examples/wanvideo/model_training/special/npu_training/Wan2.1-T2V-14B-NPU.sh
new file mode 100644
index 000000000..ac2d9dd27
--- /dev/null
+++ b/examples/wanvideo/model_training/special/npu_training/Wan2.1-T2V-14B-NPU.sh
@@ -0,0 +1,16 @@
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.1-T2V-14B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.1-T2V-14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.1-T2V-14B_full" \
+  --trainable_models "dit" \
+  --initialize_model_on_cpu
\ No newline at end of file
diff --git a/examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh b/examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh
new file mode 100644
index 000000000..4748f8728
--- /dev/null
+++ b/examples/wanvideo/model_training/special/npu_training/Wan2.2-T2V-A14B-NPU.sh
@@ -0,0 +1,38 @@
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.2-T2V-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-T2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-T2V-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.2-T2V-A14B_high_noise_full" \
+  --trainable_models "dit" \
+  --max_timestep_boundary 0.417 \
+  --min_timestep_boundary 0 \
+  --initialize_model_on_cpu
+# boundary corresponds to timesteps [875, 1000]
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata.csv \
+  --height 480 \
+  --width 832 \
+  --num_frames 49 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "Wan-AI/Wan2.2-T2V-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.2-T2V-A14B:models_t5_umt5-xxl-enc-bf16.pth,Wan-AI/Wan2.2-T2V-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Wan2.2-T2V-A14B_low_noise_full" \
+  --trainable_models "dit" \
+  --max_timestep_boundary 1 \
+  --min_timestep_boundary 0.417 \
+  --initialize_model_on_cpu
+# boundary corresponds to timesteps [0, 875)
\ No newline at end of file
diff --git a/examples/wanvideo/model_training/special/npu_training/Wan2.2-VACE-Fun-A14B-NPU.sh b/examples/wanvideo/model_training/special/npu_training/Wan2.2-VACE-Fun-A14B-NPU.sh
new file mode 100644
index 000000000..304d53d7c
--- /dev/null
+++ b/examples/wanvideo/model_training/special/npu_training/Wan2.2-VACE-Fun-A14B-NPU.sh
@@ -0,0 +1,45 @@
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --num_frames 17 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.2-VACE-Fun-A14B:high_noise_model/diffusion_pytorch_model*.safetensors,PAI/Wan2.2-VACE-Fun-A14B:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.2-VACE-Fun-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.2-VACE-Fun-A14B_high_noise_full" \
+  --trainable_models "vace" \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload \
+  --max_timestep_boundary 0.358 \
+  --min_timestep_boundary 0 \
+  --initialize_model_on_cpu
+# boundary corresponds to timesteps [900, 1000]
+
+
+accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
+  --dataset_base_path data/example_video_dataset \
+  --dataset_metadata_path data/example_video_dataset/metadata_vace.csv \
+  --data_file_keys "video,vace_video,vace_reference_image" \
+  --height 480 \
+  --width 832 \
+  --num_frames 17 \
+  --dataset_repeat 100 \
+  --model_id_with_origin_paths "PAI/Wan2.2-VACE-Fun-A14B:low_noise_model/diffusion_pytorch_model*.safetensors,PAI/Wan2.2-VACE-Fun-A14B:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.2-VACE-Fun-A14B:Wan2.1_VAE.pth" \
+  --learning_rate 1e-4 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.vace." \
+  --output_path "./models/train/Wan2.2-VACE-Fun-A14B_low_noise_full" \
+  --trainable_models "vace" \
+  --extra_inputs "vace_video,vace_reference_image" \
+  --use_gradient_checkpointing_offload \
+  --max_timestep_boundary 1 \
+  --min_timestep_boundary 0.358 \
+  --initialize_model_on_cpu
+# boundary corresponds to timesteps [0, 900]
\ No newline at end of file
diff --git a/examples/z_image/model_training/special/npu_training/Z-Image-Turbo-NPU.sh b/examples/z_image/model_training/special/npu_training/Z-Image-Turbo-NPU.sh
new file mode 100644
index 000000000..93cc645d9
--- /dev/null
+++ b/examples/z_image/model_training/special/npu_training/Z-Image-Turbo-NPU.sh
@@ -0,0 +1,16 @@
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export CPU_AFFINITY_CONF=1
+
+accelerate launch --config_file examples/z_image/model_training/full/accelerate_config.yaml examples/z_image/model_training/train.py \
+  --dataset_base_path data/example_image_dataset \
+  --dataset_metadata_path data/example_image_dataset/metadata.csv \
+  --max_pixels 1048576 \
+  --dataset_repeat 400 \
+  --model_id_with_origin_paths "Tongyi-MAI/Z-Image-Turbo:transformer/*.safetensors,Tongyi-MAI/Z-Image-Turbo:text_encoder/*.safetensors,Tongyi-MAI/Z-Image-Turbo:vae/diffusion_pytorch_model.safetensors" \
+  --learning_rate 1e-5 \
+  --num_epochs 2 \
+  --remove_prefix_in_ckpt "pipe.dit." \
+  --output_path "./models/train/Z-Image-Turbo_full" \
+  --trainable_models "dit" \
+  --use_gradient_checkpointing \
+  --dataset_num_workers 8
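The `get_device_name()` helper and the `IS_NPU_AVAILABLE` flag imported in this patch come from `diffsynth/core/device/npu_compatible_device.py`, which is not included in the diff. A minimal sketch of what such helpers could look like, assuming the Ascend `torch_npu` package registers the `torch.npu` namespace with `is_available()` and `current_device()` (the code below is illustrative, not the module's actual implementation):

```python
# Illustrative sketch only: the real diffsynth/core/device/npu_compatible_device.py
# is not part of this diff. torch_npu is assumed to provide the torch.npu namespace.
import torch

try:
    import torch_npu  # noqa: F401  # Ascend NPU backend for PyTorch
    IS_NPU_AVAILABLE = torch.npu.is_available()
except ImportError:
    IS_NPU_AVAILABLE = False


def get_device_name() -> str:
    """Return an indexed device string such as "npu:0" or "cuda:0".

    The VRAM checks in this patch call mem_get_info() with an explicit,
    indexed device string instead of a bare "npu" or "cuda".
    """
    if IS_NPU_AVAILABLE:
        return f"npu:{torch.npu.current_device()}"
    if torch.cuda.is_available():
        return f"cuda:{torch.cuda.current_device()}"
    return "cpu"
```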