41 changes: 37 additions & 4 deletions src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -178,6 +178,38 @@ def get_state_dict(self, model, unwrap=True):

return state_dict

def cast_lora_module(module):
    base_layer_dtype = module.base_layer.weight.dtype
    # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to
    # wrap this. Therefore we must ensure the bias has the same dtype as the weight
    if hasattr(module.base_layer, "bias") and module.base_layer.bias is not None:
        if module.base_layer.weight.dtype != module.base_layer.bias.dtype:
            log_bias_dtype_mismatch = True
            module.base_layer.bias.data = module.base_layer.bias.data.to(
                module.base_layer.weight.dtype
            )

    for active_adapter in module.active_adapters:
        if module.lora_A:
            module.lora_A[active_adapter] = module.lora_A[active_adapter].to(base_layer_dtype)
            if hasattr(module.lora_A[active_adapter], 'bias') and module.lora_A[active_adapter].bias is not None:
                module.lora_A[active_adapter].bias.data = module.lora_A[active_adapter].bias.data.to(base_layer_dtype)
        if module.lora_B:
            module.lora_B[active_adapter] = module.lora_B[active_adapter].to(base_layer_dtype)
            if hasattr(module.lora_B[active_adapter], 'bias') and module.lora_B[active_adapter].bias is not None:
                module.lora_B[active_adapter].bias.data = module.lora_B[active_adapter].bias.data.to(base_layer_dtype)
        if module.lora_embedding_A:
            module.lora_embedding_A[active_adapter] = module.lora_embedding_A[active_adapter].to(base_layer_dtype)
            if hasattr(module.lora_embedding_A[active_adapter], 'bias') and module.lora_embedding_A[active_adapter].bias is not None:
                module.lora_embedding_A[active_adapter].bias.data = module.lora_embedding_A[active_adapter].bias.data.to(base_layer_dtype)
        if module.lora_embedding_B:
            module.lora_embedding_B[active_adapter] = module.lora_embedding_B[active_adapter].to(base_layer_dtype)
            if hasattr(module.lora_embedding_B[active_adapter], 'bias') and module.lora_embedding_B[active_adapter].bias is not None:
                module.lora_embedding_B[active_adapter].bias.data = module.lora_embedding_B[active_adapter].bias.data.to(base_layer_dtype)
        if module.lora_magnitude_vector:
            module.lora_magnitude_vector[active_adapter] = module.lora_magnitude_vector[active_adapter].to(base_layer_dtype)
            if hasattr(module.lora_magnitude_vector[active_adapter], 'bias') and module.lora_magnitude_vector[active_adapter].bias is not None:
                module.lora_magnitude_vector[active_adapter].bias.data = module.lora_magnitude_vector[active_adapter].bias.data.to(base_layer_dtype)

Contributor:

🛠️ Refactor suggestion

⚠️ Potential issue

Cast adapters to compute dtype (not quantized weight), avoid breaking ParameterDict, and fix F841

  • Risk: Casting LoRA modules to base_layer.weight.dtype can incorrectly downcast them to int/quantized dtypes (e.g., 4-bit), breaking training.
  • Bug: Assigning module.lora_magnitude_vector[adapter] = ... .to(...) replaces a registered Parameter with a plain Tensor in a ParameterDict (see the sketch after this list).
  • Lint: log_bias_dtype_mismatch is set but never used (F841).
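
To make the ParameterDict point concrete, here is a minimal, illustrative-only sketch (assuming a recent PyTorch; the names are placeholders, not taken from this diff). It shows that `.to(dtype)` on a Parameter returns a plain Tensor, so dict-style reassignment swaps in a new object, while casting through `.data` keeps the registered Parameter:

```python
import torch
from torch import nn

pd = nn.ParameterDict({"default": nn.Parameter(torch.zeros(4, dtype=torch.float32))})
original = pd["default"]

with torch.no_grad():
    converted = original.to(torch.bfloat16)  # conversion returns a plain Tensor, not an nn.Parameter
print(type(converted))                       # <class 'torch.Tensor'>

pd["default"] = converted                    # re-registers a *new* object under the key
print(pd["default"] is original)             # False: outside references (optimizer, FSDP) still see the old object

original.data = original.data.to(torch.bfloat16)  # in-place cast: same Parameter object, new dtype
print(original.dtype)                             # torch.bfloat16
```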

Proposed fix: derive a safe target_dtype from compute_dtype (if available) or a floating weight.dtype; cast modules in-place; cast ParameterDict entries via .data; return a flag so callers can log.

-def cast_lora_module(module):
-    base_layer_dtype = module.base_layer.weight.dtype
-    # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to
-    # wrap this. Therefore we must ensure the bias has the same dtype as the weight
-    if hasattr(module.base_layer, "bias") and module.base_layer.bias is not None:
-        if module.base_layer.weight.dtype != module.base_layer.bias.dtype:
-            log_bias_dtype_mismatch = True
-            module.base_layer.bias.data = module.base_layer.bias.data.to(
-                module.base_layer.weight.dtype
-            )
-
-    for active_adapter in module.active_adapters:
-        if module.lora_A:
-            module.lora_A[active_adapter] = module.lora_A[active_adapter].to(base_layer_dtype)
-            if hasattr(module.lora_A[active_adapter], 'bias') and module.lora_A[active_adapter].bias is not None:
-                module.lora_A[active_adapter].bias.data = module.lora_A[active_adapter].bias.data.to(base_layer_dtype)
-        if module.lora_B:
-           module.lora_B[active_adapter] = module.lora_B[active_adapter].to(base_layer_dtype)
-           if hasattr(module.lora_B[active_adapter], 'bias') and module.lora_B[active_adapter].bias is not None:
-               module.lora_B[active_adapter].bias.data = module.lora_B[active_adapter].bias.data.to(base_layer_dtype)
-        if module.lora_embedding_A:
-            module.lora_embedding_A[active_adapter] = module.lora_embedding_A[active_adapter].to(base_layer_dtype)
-            if hasattr(module.lora_embedding_A[active_adapter], 'bias') and module.lora_embedding_A[active_adapter].bias is not None:
-                module.lora_embedding_A[active_adapter].bias.data = module.lora_embedding_A[active_adapter].bias.data.to(base_layer_dtype)
-        if module.lora_embedding_B:
-            module.lora_embedding_B[active_adapter] = module.lora_embedding_B[active_adapter].to(base_layer_dtype)
-            if hasattr(module.lora_embedding_B[active_adapter], 'bias') and module.lora_embedding_B[active_adapter].bias is not None:
-                module.lora_embedding_B[active_adapter].bias.data = module.lora_embedding_B[active_adapter].bias.data.to(base_layer_dtype)
-        if module.lora_magnitude_vector:
-            module.lora_magnitude_vector[active_adapter] = module.lora_magnitude_vector[active_adapter].to(base_layer_dtype)
-            if hasattr(module.lora_magnitude_vector[active_adapter], 'bias') and module.lora_magnitude_vector[active_adapter].bias is not None:
-                module.lora_magnitude_vector[active_adapter].bias.data = module.lora_magnitude_vector[active_adapter].bias.data.to(base_layer_dtype)
+def cast_lora_module(module):
+    weight_dtype = module.base_layer.weight.dtype
+    # Linear4Bit keeps its bias term in fp32; if the weight dtype is bf16 we must match bias to weight.
+    # For adapter params, prefer the module's compute dtype when available; fall back to a floating weight dtype.
+    compute_dtype = getattr(module.base_layer, "compute_dtype", None)
+    float_dtypes = (torch.float16, torch.bfloat16, torch.float32)
+    target_dtype = (
+        compute_dtype
+        if compute_dtype in float_dtypes
+        else (weight_dtype if weight_dtype in float_dtypes else None)
+    )
+
+    bias_dtype_mismatch = False
+    if hasattr(module.base_layer, "bias") and module.base_layer.bias is not None:
+        if module.base_layer.bias.dtype != weight_dtype:
+            module.base_layer.bias.data = module.base_layer.bias.data.to(weight_dtype)
+            bias_dtype_mismatch = True
+
+    # If we don't have a safe floating target dtype (e.g., quantized weight with no compute dtype), stop here.
+    if target_dtype is None:
+        return bias_dtype_mismatch
+
+    for active_adapter in module.active_adapters:
+        if getattr(module, "lora_A", None) and active_adapter in module.lora_A:
+            module.lora_A[active_adapter].to(dtype=target_dtype)
+            if getattr(module.lora_A[active_adapter], "bias", None) is not None:
+                module.lora_A[active_adapter].bias.data = module.lora_A[active_adapter].bias.data.to(target_dtype)
+        if getattr(module, "lora_B", None) and active_adapter in module.lora_B:
+            module.lora_B[active_adapter].to(dtype=target_dtype)
+            if getattr(module.lora_B[active_adapter], "bias", None) is not None:
+                module.lora_B[active_adapter].bias.data = module.lora_B[active_adapter].bias.data.to(target_dtype)
+        if getattr(module, "lora_embedding_A", None) and active_adapter in module.lora_embedding_A:
+            module.lora_embedding_A[active_adapter].to(dtype=target_dtype)
+            if getattr(module.lora_embedding_A[active_adapter], "bias", None) is not None:
+                module.lora_embedding_A[active_adapter].bias.data = module.lora_embedding_A[active_adapter].bias.data.to(target_dtype)
+        if getattr(module, "lora_embedding_B", None) and active_adapter in module.lora_embedding_B:
+            module.lora_embedding_B[active_adapter].to(dtype=target_dtype)
+            if getattr(module.lora_embedding_B[active_adapter], "bias", None) is not None:
+                module.lora_embedding_B[active_adapter].bias.data = module.lora_embedding_B[active_adapter].bias.data.to(target_dtype)
+        if getattr(module, "lora_magnitude_vector", None) and active_adapter in module.lora_magnitude_vector:
+            param = module.lora_magnitude_vector[active_adapter]
+            if isinstance(param, torch.nn.Parameter):
+                param.data = param.data.to(target_dtype)
+            else:
+                module.lora_magnitude_vector[active_adapter] = torch.nn.Parameter(
+                    param.to(target_dtype), requires_grad=getattr(param, "requires_grad", True)
+                )
+    return bias_dtype_mismatch
🧰 Tools
🪛 Ruff (0.12.2)

187-187: Local variable log_bias_dtype_mismatch is assigned to but never used

Remove assignment to unused variable log_bias_dtype_mismatch

(F841)

🪛 GitHub Actions: lint

[error] 187-187: F841 Local variable log_bias_dtype_mismatch is assigned to but never used.

🤖 Prompt for AI Agents
In src/axolotl/monkeypatch/accelerate/fsdp2.py around lines 181-213, the function casts LoRA submodules to module.base_layer.weight.dtype, which may be a quantized/int dtype and breaks training; it also mutates ParameterDict entries by replacing Parameters with Tensors, and sets log_bias_dtype_mismatch without ever using it. Compute a safe target_dtype instead: use module.compute_dtype if present, otherwise pick a floating-point version of base_layer.weight.dtype (e.g., promote int/quantized dtypes to float32/float16). Perform in-place casting (.data = .data.to(target_dtype)) for any Parameter or ParameterDict entry so Parameter objects are never replaced, and cast biases the same way via .data. Remove the unused log_bias_dtype_mismatch variable and instead return a boolean flag (e.g., cast_performed) from the function so callers can log when a change occurred.
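
A hedged, caller-side sketch of the "return a flag so callers can log" idea; `lora_modules` is a placeholder iterable and the wrapper function name is invented, mirroring the `log_bias_dtype_mismatch |= ...` pattern this diff removes:

```python
import logging

logger = logging.getLogger(__name__)

def cast_all_lora_modules(lora_modules):
    """Apply cast_lora_module (as proposed above, returning a bool) and log once if any bias was recast."""
    bias_dtype_mismatch = False
    for lora_module in lora_modules:  # placeholder: the model's LoraLayer children
        bias_dtype_mismatch |= cast_lora_module(lora_module)
    if bias_dtype_mismatch:
        logger.warning(
            "Cast at least one base-layer bias to its weight dtype so FSDP2 can wrap the module."
        )
```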

def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
"""Helper function to process LoRA modules for FSDP2."""
@@ -324,10 +356,11 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
    if auto_wrap_policy is not None:
        for module in get_module_children_bottom_up(model)[:-1]:
            if is_peft_model and isinstance(module, LoraLayer):
                module_log_bias_mismatch = _process_lora_module_for_fsdp(
                    module, fsdp2_kwargs
                )
                log_bias_dtype_mismatch |= module_log_bias_mismatch
                cast_lora_module(module)
Contributor:

We could put this behind a config field - the only downside is that upstreaming it is going to be a pain
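
A hedged sketch of what such a gate might look like; the field name fsdp2_cast_lora_to_base_dtype and the cfg object are hypothetical placeholders, not existing axolotl options:

```python
# Hypothetical gating sketch (field name invented for illustration; not an existing axolotl option).
def should_cast_lora(cfg) -> bool:
    # Defaulting to True preserves the behavior introduced by this PR when the field is absent.
    return bool(getattr(cfg, "fsdp2_cast_lora_to_base_dtype", True))

# At the call site inside fsdp2_prepare_model (shown as a comment, matching the diff context below):
#     if is_peft_model and isinstance(module, LoraLayer) and should_cast_lora(cfg):
#         cast_lora_module(module)
```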

                # module_log_bias_mismatch = _process_lora_module_for_fsdp(
                #     module, fsdp2_kwargs
                # )
                # log_bias_dtype_mismatch |= module_log_bias_mismatch
            if auto_wrap_policy(module) and not isinstance(module, FSDPModule):
                fully_shard(module, **fsdp2_kwargs)
