@@ -96,11 +96,29 @@ class Config(Configurable.Config):
9696 Requires base model to be loaded from HF/initial_load_path on resume.
9797 Set to False to save full model weights for debugging without pretrained base."""
9898
quantize_base: str = ""
"""Quantize base (non-LoRA) weights. "" = no quantization, "nf4" = NF4 (QLoRA).
NF4 quantization reduces base weight memory ~4x while keeping LoRA adapters in full precision."""

nf4_scaler_block_size: int = 128
"""Scaler block size for NF4 quantization. Default 128 works with debugmodel on 8 GPUs.
The default torchao value (256) may be too large for sharded tensors."""
106+
def __init__(self, config: Config, **kwargs):
    """Capture LoRA settings from *config*, validate them, and log the result.

    Raises:
        ValueError: if ``config.quantize_base`` names an unsupported scheme
            (anything other than "" or "nf4").
    """
    self.rank = config.rank
    self.alpha = config.alpha
    self.save_adapter_only = config.save_adapter_only
    self.quantize_base = config.quantize_base
    self.nf4_scaler_block_size = config.nf4_scaler_block_size

    # Only NF4 (QLoRA) is implemented; fail fast on anything else.
    if self.quantize_base and self.quantize_base != "nf4":
        raise ValueError(
            f"Unsupported quantize_base value: '{self.quantize_base}'. "
            "Supported values: '' (none), 'nf4'."
        )

    quant_note = f", quantize_base={self.quantize_base}" if self.quantize_base else ""
    logger.info(
        f"LoRA training active with rank={self.rank}, alpha={self.alpha}" + quant_note
    )
104122
105123 def convert (self , model : nn .Module ) -> None :
106124 model .requires_grad_ (False )
@@ -134,6 +152,58 @@ def converter_key_filter(key: str) -> bool:
134152 object .__setattr__ (module , "converter_key_filter" , converter_key_filter )
135153 object .__setattr__ (module , "save_converter_keys_only" , self .save_adapter_only )
136154
155+ # Register a one-shot forward pre-hook to quantize base weights after
156+ # checkpoint load but before the first forward pass (QLoRA).
157+ if self .quantize_base == "nf4" :
158+ from torch .distributed .tensor import DTensor
159+ from torchao .dtypes .nf4tensor import to_nf4
160+
161+ lora_classes = tuple (_lora_class_cache .values ())
162+ nf4_scaler_block_size = self .nf4_scaler_block_size
163+
def _to_nf4_tensor(weight: torch.Tensor) -> torch.Tensor:
    """Convert a base weight to NF4, handling both plain tensors and DTensors.

    For a DTensor, the local shard is quantized and re-wrapped with the
    original device mesh and placements so sharding is preserved.

    Raises:
        ValueError: if the number of NF4 scalers is not divisible by
            ``nf4_scaler_block_size`` (closure from the enclosing config).
    """
    nf4_block_size = 64  # NF4 default block size
    is_dtensor = isinstance(weight, DTensor)
    local_weight = weight.to_local() if is_dtensor else weight

    num_scalers = local_weight.numel() // nf4_block_size
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, which would let an invalid block size reach torchao
    # and fail there with a far less actionable error.
    if num_scalers % nf4_scaler_block_size != 0:
        raise ValueError(
            f"NF4 quantization failed: num_scalers ({num_scalers}) is not "
            f"divisible by nf4_scaler_block_size ({nf4_scaler_block_size}). "
            f"Try a smaller nf4_scaler_block_size in LoRAConverter.Config "
            f"(e.g., 64, 32, or 1)."
        )

    nf4_local = to_nf4(
        local_weight, scaler_block_size=nf4_scaler_block_size
    )

    if is_dtensor:
        return DTensor.from_local(
            nf4_local, weight.device_mesh, weight.placements
        )
    return nf4_local
187+
def _quantize_hook(
    mod: nn.Module, args: Any, handle: torch.utils.hooks.RemovableHandle
) -> None:
    """One-shot pre-forward hook: NF4-quantize every LoRA layer's base
    weight, then unregister itself via *handle* so it never runs again."""
    lora_layers = [sub for sub in mod.modules() if isinstance(sub, lora_classes)]
    for layer in lora_layers:
        quantized = _to_nf4_tensor(layer.weight.data)
        layer.weight = nn.Parameter(quantized, requires_grad=False)
    logger.info("QLoRA: quantized base weights to NF4")
    handle.remove()
198+
199+ # Use a list to allow the closure to reference the handle before it exists
200+ handle_ref : list [torch .utils .hooks .RemovableHandle ] = []
201+ handle_ref .append (
202+ module .register_forward_pre_hook (
203+ lambda mod , args : _quantize_hook (mod , args , handle_ref [0 ])
204+ )
205+ )
206+
def post_optimizer_hook(self, model: nn.Module | list[nn.Module]) -> None:
    """Intentional no-op: this converter requires no per-step maintenance
    after the optimizer step. LoRA adapters update through the optimizer
    normally; presumably the base weights need no post-step fix-up — NOTE(review):
    confirm nothing here must re-sync quantized base weights."""
    pass
139209
0 commit comments