 import argparse
 import copy
 import itertools
+import json
 import logging
 import math
 import os
 
 import numpy as np
 import torch
-import torch.utils.checkpoint
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
 from huggingface_hub import create_repo, upload_folder
 from huggingface_hub.utils import insecure_hashlib
-from peft import LoraConfig, set_peft_model_state_dict
+from peft import LoraConfig, prepare_model_for_kbit_training, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
 from PIL import Image
 from PIL.ImageOps import exif_transpose
 import diffusers
 from diffusers import (
     AutoencoderKL,
+    BitsAndBytesConfig,
     FlowMatchEulerDiscreteScheduler,
     HiDreamImagePipeline,
     HiDreamImageTransformer2DModel,
@@ -282,6 +283,12 @@ def parse_args(input_args=None):
         default="meta-llama/Meta-Llama-3.1-8B-Instruct",
         help="Path to pretrained model or model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--bnb_quantization_config_path",
+        type=str,
+        default=None,
+        help="Quantization config in a JSON file that will be used to define the bitsandbytes quant config of the DiT.",
+    )
     parser.add_argument(
         "--revision",
         type=str,
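The new `--bnb_quantization_config_path` flag points at a JSON file whose keys are passed straight to `BitsAndBytesConfig` (see the loading hunk further below); `bnb_4bit_compute_dtype` is filled in by the script from the training precision, so it does not need to be in the file. As a minimal, illustrative sketch (the file name and the exact key choices are assumptions, not taken from this commit), such a config file could be produced like this:

# write_bnb_config.py -- illustrative only: writes a 4-bit NF4 config for
# --bnb_quantization_config_path; key names are standard BitsAndBytesConfig kwargs.
import json

config_kwargs = {
    "load_in_4bit": True,            # quantize the transformer weights to 4 bit
    "bnb_4bit_quant_type": "nf4",    # NormalFloat4 quantization
    "bnb_4bit_use_double_quant": True,
    # "bnb_4bit_compute_dtype" is injected by the training script from the mixed-precision setting
}

with open("bnb_nf4.json", "w") as f:
    json.dump(config_kwargs, f, indent=2)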
@@ -1056,6 +1063,14 @@ def main(args):
         args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
     )
 
+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
     # Load scheduler and models
     noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="scheduler", revision=args.revision, shift=3.0
@@ -1064,20 +1079,30 @@ def main(args):
     text_encoder_one, text_encoder_two, text_encoder_three, text_encoder_four = load_text_encoders(
         text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
     )
-
     vae = AutoencoderKL.from_pretrained(
         args.pretrained_model_name_or_path,
         subfolder="vae",
         revision=args.revision,
         variant=args.variant,
     )
+    quantization_config = None
+    if args.bnb_quantization_config_path is not None:
+        with open(args.bnb_quantization_config_path, "r") as f:
+            config_kwargs = json.load(f)
+        config_kwargs["bnb_4bit_compute_dtype"] = weight_dtype
+        quantization_config = BitsAndBytesConfig(**config_kwargs)
+
     transformer = HiDreamImageTransformer2DModel.from_pretrained(
         args.pretrained_model_name_or_path,
         subfolder="transformer",
         revision=args.revision,
         variant=args.variant,
+        quantization_config=quantization_config,
+        torch_dtype=weight_dtype,
         force_inference_output=True,
     )
+    if args.bnb_quantization_config_path is not None:
+        transformer = prepare_model_for_kbit_training(transformer, use_gradient_checkpointing=False)
 
     # We only train the additional adapter LoRA layers
     transformer.requires_grad_(False)
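Because the transformer is now loaded through bitsandbytes whenever a config is supplied, the later `transformer.to(...)` call in this diff drops the `dtype=weight_dtype` argument: the quantized modules keep their packed storage and only the device is moved. A rough sanity check, an illustrative sketch that is not part of the commit and assumes it runs right after the lines above with a 4-bit config, could look like this:

# Illustrative sketch only: inspect the transformer after quantized loading and
# prepare_model_for_kbit_training. 4-bit bitsandbytes weights are stored as packed
# torch.uint8 tensors; the remaining parameters (e.g. norm layers) are upcast to
# float32 by peft for training stability.
from collections import Counter

dtype_counts = Counter(p.dtype for p in transformer.parameters())
print(dtype_counts)  # expect mostly torch.uint8 entries plus some torch.float32

approx_bytes = sum(p.numel() * p.element_size() for p in transformer.parameters())
print(f"approx parameter storage: {approx_bytes / 2**30:.2f} GiB")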
@@ -1087,14 +1112,6 @@ def main(args):
     text_encoder_three.requires_grad_(False)
     text_encoder_four.requires_grad_(False)
 
-    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
-    # as these weights are only used for inference, keeping weights in full precision is not required.
-    weight_dtype = torch.float32
-    if accelerator.mixed_precision == "fp16":
-        weight_dtype = torch.float16
-    elif accelerator.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
-
     if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
         # due to pytorch#99272, MPS does not yet support bfloat16.
         raise ValueError(
@@ -1109,7 +1126,7 @@ def main(args):
     text_encoder_three.to(**to_kwargs)
     text_encoder_four.to(**to_kwargs)
     # we never offload the transformer to CPU, so we can just use the accelerator device
-    transformer.to(accelerator.device, dtype=weight_dtype)
+    transformer.to(accelerator.device)
 
     # Initialize a text encoding pipeline and keep it to CPU for now.
     text_encoding_pipeline = HiDreamImagePipeline.from_pretrained(