Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions config/examples/train_lora_zimage_base_32gb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
---
# Z-Image Base - Character/Person LoRA training config (32GB VRAM, e.g. RTX 5090).
# Best practices applied here: Prodigy (or Prodigy Schedule Free) optimizer,
# batch_size 2 (or an effective batch of 4 via gradient_accumulation), DOP for
# identity preservation, 1024 resolution, linear rank 128.
# Before training: replace folder_path and the [trigger] sample prompts below.
job: extension
config:
  # Run name; checkpoints and samples are written under training_folder/<name>.
  name: "my_zimage_base_character_lora_v1"
  process:
    - type: 'diffusion_trainer'
      training_folder: "output"
      device: cuda:0
      # LoRA: rank 128 works well for character identity; use 64 if VRAM is tight.
      # Z-Image typically needs no conv-layer training.
      network:
        type: "lora"
        linear: 128
        linear_alpha: 128
      save:
        dtype: bf16                    # precision of the saved LoRA weights
        save_every: 500                # checkpoint interval, in steps
        max_step_saves_to_keep: 6      # older step checkpoints are rotated out
        save_format: safetensors
      datasets:
        - folder_path: "/path/to/images/folder"  # replace with your dataset folder
          caption_ext: "txt"                     # one .txt caption file per image
          caption_dropout_rate: 0.05             # train a small fraction of steps uncaptioned
          cache_latents_to_disk: true            # cache VAE latents so they are not recomputed
          # 1024 matches Z-Image's native resolution; use [512, 768, 1024] for
          # multi-resolution training if preferred.
          resolution: [ 1024, 1024 ]
      train:
        batch_size: 2 # 32GB allows 2; Prodigy works well with larger batches. Try 4, or gradient_accumulation: 2, if there is headroom
        gradient_accumulation: 1
        steps: 3000 # 2500-3000 is typical for character identity
        train_unet: true
        train_text_encoder: false
        gradient_checkpointing: true
        noise_scheduler: "flowmatch"
        timestep_type: "weighted"
        content_or_style: "balanced"
        loss_type: "mse"
        dtype: bf16
        # Prodigy is adaptive: set the nominal lr to 1.0 and let it scale itself.
        # Use optimizer "prodigy_schedulefree" for the schedule-free variant.
        optimizer: "prodigy"
        lr: 1.0
        optimizer_params:
          weight_decay: 0.01
        lr_scheduler: "constant"
        # DOP (diff output preservation): preserves the base model's output for the
        # class word without the trigger, reducing overfitting for character LoRAs.
        diff_output_preservation: true
        diff_output_preservation_multiplier: 1.0
        diff_output_preservation_class: "person"
        switch_boundary_every: 1
        unload_text_encoder: false
        # cache_text_embeddings: true # optional; saves VRAM when captions are used
        ema_config:
          use_ema: false
          ema_decay: 0.99
        skip_first_sample: false
        disable_sampling: false
      logging:
        log_every: 1
        use_ui_logger: true
      model:
        name_or_path: "Tongyi-MAI/Z-Image"
        arch: "zimage"
        quantize: true        # float8-quantize the transformer to fit in 32GB
        qtype: "qfloat8"
        quantize_te: true     # also quantize the text encoder
        qtype_te: "qfloat8"
        low_vram: false # set true if you hit OOM on 32GB
        model_kwargs: {}
      sample:
        sampler: "flowmatch"
        sample_every: 250     # sample-generation interval, in steps
        width: 1024
        height: 1024
        # [trigger] is substituted with the trained trigger word at sample time.
        samples:
          - prompt: "[trigger], studio portrait, soft lighting"
          - prompt: "[trigger] on a beach, golden hour"
          - prompt: "[trigger], casual outfit, urban background"
        neg: ""
        seed: 42
        walk_seed: true
        guidance_scale: 4 # the non-distilled Base model uses CFG
        sample_steps: 30
meta:
  name: "[name]"
  version: '1.0'
84 changes: 84 additions & 0 deletions config/examples/train_lora_zimage_turbo_32gb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
---
# Z-Image Turbo - Character/Person LoRA training config (32GB VRAM, e.g. RTX 5090).
# Turbo is a distilled model: a training adapter (assistant_lora_path) is required
# to avoid quality loss while training; use the v2 adapter.
# Best practices applied here: Prodigy or Prodigy Schedule Free, batch_size 2,
# DOP for identity preservation, 1024 resolution, rank 128.
# Before training: replace folder_path and the [trigger] sample prompts below.
job: extension
config:
  # Run name; checkpoints and samples are written under training_folder/<name>.
  name: "my_zimage_turbo_character_lora_v1"
  process:
    - type: 'diffusion_trainer'
      training_folder: "output"
      device: cuda:0
      # LoRA: rank 128 for character identity; use 64 if VRAM is tight.
      network:
        type: "lora"
        linear: 128
        linear_alpha: 128
      save:
        dtype: bf16                    # precision of the saved LoRA weights
        save_every: 500                # checkpoint interval, in steps
        max_step_saves_to_keep: 6      # older step checkpoints are rotated out
        save_format: safetensors
      datasets:
        - folder_path: "/path/to/images/folder"  # replace with your dataset folder
          caption_ext: "txt"                     # one .txt caption file per image
          caption_dropout_rate: 0.05             # train a small fraction of steps uncaptioned
          cache_latents_to_disk: true            # cache VAE latents so they are not recomputed
          resolution: [ 1024, 1024 ]             # Z-Image native resolution
      train:
        batch_size: 2 # Prodigy works well with batch 2-4 on 32GB
        gradient_accumulation: 1
        steps: 3000
        train_unet: true
        train_text_encoder: false
        gradient_checkpointing: true
        noise_scheduler: "flowmatch"
        timestep_type: "weighted"
        content_or_style: "balanced"
        loss_type: "mse"
        dtype: bf16
        # Prodigy is adaptive: nominal lr 1.0; or use "prodigy_schedulefree"
        optimizer: "prodigy"
        lr: 1.0
        optimizer_params:
          weight_decay: 0.01
        lr_scheduler: "constant"
        diff_output_preservation: true # DOP reduces overfitting for character identity
        diff_output_preservation_multiplier: 1.0
        diff_output_preservation_class: "person"
        switch_boundary_every: 1
        unload_text_encoder: false
        ema_config:
          use_ema: false
          ema_decay: 0.99
        skip_first_sample: false
        disable_sampling: false
      logging:
        log_every: 1
        use_ui_logger: true
      model:
        name_or_path: "Tongyi-MAI/Z-Image-Turbo"
        arch: "zimage"
        # Required for Turbo: the training adapter prevents the quality degradation
        # that otherwise occurs when training directly on the distilled model.
        assistant_lora_path: "ostris/zimage_turbo_training_adapter/zimage_turbo_training_adapter_v2.safetensors"
        quantize: true        # float8-quantize the transformer to fit in 32GB
        qtype: "qfloat8"
        quantize_te: true     # also quantize the text encoder
        qtype_te: "qfloat8"
        low_vram: false
        model_kwargs: {}
      sample:
        sampler: "flowmatch"
        sample_every: 250     # sample-generation interval, in steps
        width: 1024
        height: 1024
        # [trigger] is substituted with the trained trigger word at sample time.
        samples:
          - prompt: "[trigger], studio portrait, soft lighting"
          - prompt: "[trigger] on a beach, golden hour"
          - prompt: "[trigger], casual outfit, urban background"
        neg: ""
        seed: 42
        walk_seed: true
        guidance_scale: 1 # Turbo is CFG-distilled: use 1
        sample_steps: 8 # Turbo needs far fewer steps than Base
meta:
  name: "[name]"
  version: '1.0'
1 change: 1 addition & 0 deletions dgx_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ k-diffusion
open_clip_torch
timm
prodigyopt
prodigy-plus-schedule-free
controlnet_aux==0.0.10
python-dotenv
bitsandbytes
Expand Down
13 changes: 13 additions & 0 deletions jobs/process/BaseSDTrainProcess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2180,6 +2180,9 @@ def run(self):
### HOOK ###
if self.torch_profiler is not None:
self.torch_profiler.start()
# Schedule-Free optimizers (e.g. Prodigy Schedule Free) need train() during training step
if hasattr(optimizer, 'train') and callable(optimizer.train):
optimizer.train()
did_oom = False
loss_dict = None
try:
Expand Down Expand Up @@ -2262,8 +2265,13 @@ def run(self):
# print above the progress bar
if self.progress_bar is not None:
self.progress_bar.pause()
# Schedule-Free: use averaged params for checkpoint
if hasattr(optimizer, 'eval') and callable(optimizer.eval):
optimizer.eval()
print_acc(f"\nSaving at step {self.step_num}")
self.save(self.step_num)
if hasattr(optimizer, 'train') and callable(optimizer.train):
optimizer.train()
self.ensure_params_requires_grad()
# clear any grads
optimizer.zero_grad()
Expand All @@ -2276,10 +2284,15 @@ def run(self):
if self.progress_bar is not None:
self.progress_bar.pause()
flush()
# Schedule-Free: use averaged params for sampling
if hasattr(optimizer, 'eval') and callable(optimizer.eval):
optimizer.eval()
# print above the progress bar
if self.train_config.free_u:
self.sd.pipeline.disable_freeu()
self.sample(self.step_num)
if hasattr(optimizer, 'train') and callable(optimizer.train):
optimizer.train()
if self.train_config.unload_text_encoder:
# make sure the text encoder is unloaded
self.sd.text_encoder_to('cpu')
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ k-diffusion
open_clip_torch
timm
prodigyopt
prodigy-plus-schedule-free
controlnet_aux==0.0.10
python-dotenv
bitsandbytes
Expand Down
14 changes: 14 additions & 0 deletions toolkit/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,20 @@ def get_optimizer(
# let net be the neural network you want to train
# you can choose weight decay value based on your problem, 0 by default
optimizer = Prodigy8bit(params, lr=use_lr, eps=1e-6, **optimizer_params)
elif (lower_type.startswith("prodigy_schedulefree") or
lower_type.replace("-", "_") == "prodigy_schedule_free"):
try:
from prodigyplus.prodigy_plus_schedulefree import ProdigyPlusScheduleFree
except ImportError:
raise ImportError(
"Prodigy Schedule Free requires: pip install prodigy-plus-schedule-free"
)
print("Using Prodigy + Schedule-Free optimizer")
use_lr = learning_rate
if use_lr < 0.1:
use_lr = 1.0
print(f"Using lr {use_lr}")
optimizer = ProdigyPlusScheduleFree(params, lr=use_lr, **optimizer_params)
elif lower_type.startswith("prodigy"):
from prodigyopt import Prodigy

Expand Down
4 changes: 4 additions & 0 deletions ui/src/app/jobs/new/SimpleJob.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,11 @@ export default function SimpleJob({
onChange={value => setJobConfig(value, 'config.process[0].train.optimizer')}
options={[
{ value: 'adamw8bit', label: 'AdamW8Bit' },
{ value: 'adamw', label: 'AdamW' },
{ value: 'adafactor', label: 'Adafactor' },
{ value: 'prodigy', label: 'Prodigy' },
{ value: 'prodigy8bit', label: 'Prodigy 8-bit' },
{ value: 'prodigy_schedulefree', label: 'Prodigy Schedule Free' },
]}
/>
<NumberInput
Expand Down