diff --git a/examples/dreambooth/accelerate_config.yaml b/examples/dreambooth/accelerate_config.yaml
new file mode 100644
index 000000000000..30b1c1cb9ff0
--- /dev/null
+++ b/examples/dreambooth/accelerate_config.yaml
@@ -0,0 +1,14 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_config_file: ds_config_zero2.json
+  zero3_init_flag: false
+distributed_type: DEEPSPEED
+machine_rank: 0
+main_training_function: main
+mixed_precision: fp16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+use_cpu: false
diff --git a/examples/dreambooth/ds_config_zero2.json b/examples/dreambooth/ds_config_zero2.json
new file mode 100644
index 000000000000..41227d7011ea
--- /dev/null
+++ b/examples/dreambooth/ds_config_zero2.json
@@ -0,0 +1,38 @@
+{
+    "train_batch_size": 16,
+    "gradient_accumulation_steps": 1,
+    "gradient_clipping": 1.0,
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu"
+        },
+        "offload_param": {
+            "device": "cpu"
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "reduce_bucket_size": 50000000,
+        "allgather_bucket_size": 50000000
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 1e-5,
+            "betas": [0.9, 0.999],
+            "eps": 1e-8,
+            "weight_decay": 1e-2
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": 0,
+            "warmup_max_lr": 1e-5,
+            "warmup_num_steps": 100
+        }
+    },
+    "steps_per_print": 10,
+    "wall_clock_breakdown": false,
+    "communication_data_type": "fp16"
+}
diff --git a/examples/dreambooth/setup_training_env.sh b/examples/dreambooth/setup_training_env.sh
new file mode 100755
index 000000000000..a4d2752ab02d
--- /dev/null
+++ b/examples/dreambooth/setup_training_env.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Extend NCCL timeouts
+export NCCL_SOCKET_TIMEOUT=7200000
+export DEEPSPEED_TIMEOUT=7200000
+
+# Set CPU threading optimizations
+export OMP_NUM_THREADS=1
+export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
+
+# Increase system shared memory limits
+sudo sysctl -w kernel.shmmax=85899345920
+sudo sysctl -w kernel.shmall=2097152
+
+# Enable NCCL debugging for diagnostics
+export NCCL_DEBUG=INFO
+
+# Optional: Set NCCL topology optimization
+# Uncomment if needed after checking nvidia-smi topo -m
+# export NCCL_P2P_LEVEL=PHB
+
+# Persist changes to sysctl
+echo "kernel.shmmax=85899345920" | sudo tee -a /etc/sysctl.conf
+echo "kernel.shmall=2097152" | sudo tee -a /etc/sysctl.conf
+sudo sysctl -p