Skip to content

Commit 49edc46

Browse files
committed
Fix bf16 dtype mismatch in ZeRO-3 with zero_quantized_weights
When using ZeRO-3 with zero_quantized_weights=True and bf16 enabled, the dequantized weights were incorrectly cast to fp16 instead of preserving the original bf16 dtype. This caused a RuntimeError during training with BERT and similar models. The fix adds original_dtype tracking to AllGatherCoalescedHandle, mirroring the existing pattern in AllGatherHandle, so that weights are converted back to their original dtype after dequantization. Fixes deepspeedai#7775. Signed-off-by: juyterman1000 <fastrunner10090@gmail.com>
1 parent 43125a7 commit 49edc46

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

deepspeed/runtime/zero/partition_parameters.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,7 @@ def __init__(
713713
world_size: int,
714714
use_secondary_tensor=False,
715715
quantization=None,
716+
original_dtype=None,
716717
) -> None:
717718
self.allgather_handle = allgather_handle
718719
self.params = params
@@ -721,6 +722,7 @@ def __init__(
721722
self.use_secondary_tensor = use_secondary_tensor
722723
self.complete = False
723724
self.quantization = quantization
725+
self.original_dtype = original_dtype
724726

725727
for param in self.params:
726728
if param.ds_status != ZeroParamStatus.INFLIGHT:
@@ -735,8 +737,13 @@ def wait(self, handle_dependency=True) -> None:
735737

736738
if self.quantization:
737739
instrument_w_nvtx(self.quantization.quant_handle.wait)()
738-
flat_tensor = self.quantization.backend.dequantize(
739-
self.quantization.quantized_param, self.quantization.scale_buffer).to(self.params[0].device)
740+
# Fix for issue #7775: convert dequantized tensor back to original dtype (e.g., bf16)
741+
# to prevent dtype mismatch when zero_quantized_weights is used with bf16
742+
dequantized = self.quantization.backend.dequantize(
743+
self.quantization.quantized_param, self.quantization.scale_buffer)
744+
if self.original_dtype is not None:
745+
dequantized = dequantized.to(self.original_dtype)
746+
flat_tensor = dequantized.to(self.params[0].device)
740747

741748
self.partitions: List[Parameter] = []
742749
for i in range(self.world_size):
@@ -1469,13 +1476,16 @@ def all_gather_coalesced(params: Iterable[Parameter],
14691476
quant_info.scale_buffer = quant_scale_buffer
14701477
quant_info.partition_sz = partition_sz
14711478
quant_info.world_size = world_size
1479+
# Get the original dtype from param's ds_tensor for proper dtype restoration after dequantization
1480+
original_dtype = params[0].ds_tensor.dtype if params else None
14721481
return AllGatherCoalescedHandle(
14731482
allgather_handle=handle,
14741483
params=params,
14751484
partitions=None,
14761485
world_size=world_size,
14771486
use_secondary_tensor=use_secondary_tensor,
14781487
quantization=quant_info,
1488+
original_dtype=original_dtype,
14791489
)
14801490

14811491
def partition(param_list=None, hierarchy=0, has_been_updated=False, free_data=True):

0 commit comments

Comments
 (0)