From 2ad04eb972a35861f63e3bcc40a6f1ebcfb50f2f Mon Sep 17 00:00:00 2001
From: SrijanUpadhyay <159617011+SrijanUpadhyay@users.noreply.github.com>
Date: Sun, 19 Oct 2025 15:42:35 +0000
Subject: [PATCH 1/2] [Training] Add DeepSpeed ZeRO-2 optimizations for H200 GPUs

This commit adds configuration files and a setup script to resolve NCCL
timeout issues during DeepSpeed ZeRO-2 training on H200 GPUs.

The changes include:
- Extended NCCL and DeepSpeed timeouts
- Optimized bucket sizes for gradient communication
- CPU threading and CUDA allocator settings
- Larger system shared-memory limits
- NCCL debug logging for diagnostics

The implementation provides:
1. DeepSpeed ZeRO-2 configuration (ds_config_zero2.json)
2. Environment setup script (setup_training_env.sh)
3. Accelerate configuration (accelerate_config.yaml)

These changes improve training stability on H200 GPUs with
high-resolution data and aggressive training settings.
---
 examples/dreambooth/accelerate_config.yaml | 14 ++++++++
 examples/dreambooth/ds_config_zero2.json   | 39 +++++++++++++++++++++++
 examples/dreambooth/setup_training_env.sh  | 25 ++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 examples/dreambooth/accelerate_config.yaml
 create mode 100644 examples/dreambooth/ds_config_zero2.json
 create mode 100755 examples/dreambooth/setup_training_env.sh

diff --git a/examples/dreambooth/accelerate_config.yaml b/examples/dreambooth/accelerate_config.yaml
new file mode 100644
index 000000000000..30b1c1cb9ff0
--- /dev/null
+++ b/examples/dreambooth/accelerate_config.yaml
@@ -0,0 +1,14 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_config_file: ds_config_zero2.json
+  zero3_init_flag: false
+distributed_type: DEEPSPEED
+machine_rank: 0
+main_training_function: main
+mixed_precision: fp16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+use_cpu: false
\ No newline at end of file
diff --git a/examples/dreambooth/ds_config_zero2.json b/examples/dreambooth/ds_config_zero2.json
new file mode 100644
index 000000000000..41227d7011ea
--- /dev/null
+++ b/examples/dreambooth/ds_config_zero2.json
@@ -0,0 +1,39 @@
+{
+  "train_batch_size": 16,
+  "train_micro_batch_size_per_gpu": 2,
+  "gradient_accumulation_steps": 1,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu"
+    },
+    "offload_param": {
+      "device": "cpu"
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "reduce_bucket_size": 50000000,
+    "allgather_bucket_size": 50000000
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": 1e-5,
+      "betas": [0.9, 0.999],
+      "eps": 1e-8,
+      "weight_decay": 1e-2
+    }
+  },
+  "scheduler": {
+    "type": "WarmupLR",
+    "params": {
+      "warmup_min_lr": 0,
+      "warmup_max_lr": 1e-5,
+      "warmup_num_steps": 100
+    }
+  },
+  "steps_per_print": 10,
+  "wall_clock_breakdown": false,
+  "communication_data_type": "fp16"
+}
\ No newline at end of file
diff --git a/examples/dreambooth/setup_training_env.sh b/examples/dreambooth/setup_training_env.sh
new file mode 100755
index 000000000000..a4d2752ab02d
--- /dev/null
+++ b/examples/dreambooth/setup_training_env.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Extend NCCL timeouts (source this script so the exports reach your shell)
+export NCCL_SOCKET_TIMEOUT=7200000
+export DEEPSPEED_TIMEOUT=7200000
+
+# Set CPU threading and CUDA allocator optimizations
+export OMP_NUM_THREADS=1
+export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb=512
+
+# Increase system shared memory limits (shmall is in 4 KiB pages; 20971520 pages = 80 GiB, matching shmmax)
+sudo sysctl -w kernel.shmmax=85899345920
+sudo sysctl -w kernel.shmall=20971520
+
+# Enable NCCL debugging for diagnostics
+export NCCL_DEBUG=INFO
+
+# Optional: set NCCL topology optimization
+# Uncomment if needed after checking `nvidia-smi topo -m`
+# export NCCL_P2P_LEVEL=PHB
+
+# Persist shared-memory changes across reboots
+echo "kernel.shmmax=85899345920" | sudo tee -a /etc/sysctl.conf
+echo "kernel.shmall=20971520" | sudo tee -a /etc/sysctl.conf
+sudo sysctl -p
\ No newline at end of file
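
Usage sketch (illustrative, not part of the patches): one way to drive the three
files above with the stock examples/dreambooth/train_dreambooth.py script. The
model id, prompt, and data paths are placeholders. Note that
setup_training_env.sh must be sourced rather than executed so its exports reach
the launching shell, that accelerate_config.yaml resolves ds_config_zero2.json
relative to the working directory, and that train_batch_size in
ds_config_zero2.json (16) must equal micro-batch (2) x gradient-accumulation
steps (1) x num_processes (8).

    # Run from the directory that holds the config files, in the launching shell
    cd examples/dreambooth
    source setup_training_env.sh

    # 8 processes, ZeRO-2, fp16 -- matches accelerate_config.yaml and ds_config_zero2.json
    accelerate launch --config_file accelerate_config.yaml train_dreambooth.py \
        --pretrained_model_name_or_path stabilityai/stable-diffusion-2-1 \
        --instance_data_dir ./instance_images \
        --instance_prompt "a photo of sks dog" \
        --resolution 1024 \
        --train_batch_size 2 \
        --mixed_precision fp16 \
        --output_dir ./dreambooth-h200
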
From 3efb13e80bb37bba5cccb27b22d1512ca9830068 Mon Sep 17 00:00:00 2001
From: SrijanUpadhyay <159617011+SrijanUpadhyay@users.noreply.github.com>
Date: Sun, 19 Oct 2025 16:47:38 +0000
Subject: [PATCH 2/2] Fix #12504: Suppress ComplexHalf warning in FreeU FFT operations

When using FreeU with half-precision (torch.float16) models, PyTorch may
emit UserWarnings about experimental ComplexHalf support during FFT
operations. This change locally suppresses that specific warning in the
fourier_filter function to avoid flooding user logs while preserving
behavior.

- Added a warnings import
- Added local warning suppression around the fftn/ifftn calls when the
  input dtype is float16
- Only the specific ComplexHalf experimental warning is suppressed
---
 src/diffusers/utils/torch_utils.py | 31 +++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index a1ab8cda431f..b9edd203d087 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -18,6 +18,7 @@
 import functools
 import os
 from typing import Callable, Dict, List, Optional, Tuple, Union
+import warnings
 
 from . import logging
 from .import_utils import is_torch_available, is_torch_npu_available, is_torch_version
@@ -222,8 +223,23 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T
         x = x.to(dtype=torch.float32)
 
     # FFT
-    x_freq = fftn(x, dim=(-2, -1))
-    x_freq = fftshift(x_freq, dim=(-2, -1))
+    # When running with torch.float16, PyTorch may emit a UserWarning about
+    # ComplexHalf (experimental) support when performing FFTs. This warning is
+    # noisy for users of the FreeU feature and doesn't change the behaviour of
+    # the algorithm here. We therefore locally suppress that specific warning
+    # around the FFT calls when the input dtype is float16.
+    if x.dtype == torch.float16:
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="ComplexHalf support is experimental and many operators don't support it yet.*",
+                category=UserWarning,
+            )
+            x_freq = fftn(x, dim=(-2, -1))
+            x_freq = fftshift(x_freq, dim=(-2, -1))
+    else:
+        x_freq = fftn(x, dim=(-2, -1))
+        x_freq = fftshift(x_freq, dim=(-2, -1))
 
     B, C, H, W = x_freq.shape
     mask = torch.ones((B, C, H, W), device=x.device)
@@ -234,7 +250,16 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T
 
     # IFFT
     x_freq = ifftshift(x_freq, dim=(-2, -1))
-    x_filtered = ifftn(x_freq, dim=(-2, -1)).real
+    if x.dtype == torch.float16:
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="ComplexHalf support is experimental and many operators don't support it yet.*",
+                category=UserWarning,
+            )
+            x_filtered = ifftn(x_freq, dim=(-2, -1)).real
+    else:
+        x_filtered = ifftn(x_freq, dim=(-2, -1)).real
 
     return x_filtered.to(dtype=x_in.dtype)
 
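
Verification sketch (illustrative, not part of the patch): a quick check that
fourier_filter stays quiet for fp16 inputs after this change. It assumes a CUDA
device, since PyTorch only implements ComplexHalf FFTs on GPU, and a fresh
interpreter, since PyTorch may emit the ComplexHalf warning only once per
process; the tensor shape and filter values are arbitrary.

    import warnings

    import torch

    from diffusers.utils.torch_utils import fourier_filter

    # fp16 input exercises the ComplexHalf code path inside fourier_filter
    x = torch.randn(1, 4, 64, 64, dtype=torch.float16, device="cuda")

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        out = fourier_filter(x, threshold=1, scale=0.9)

    # Output keeps the input dtype; no ComplexHalf warning should surface
    assert out.dtype == torch.float16
    assert not any("ComplexHalf" in str(w.message) for w in caught)
    print("ok:", tuple(out.shape), out.dtype)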