Commit 54e44e6

Merge pull request #7 from codewithdark-git/fix/awq-quantized-linear-device-issue
Fix: Unify nn.Module device placement across all quantizers and base …
2 parents 676244a + 33e21ba commit 54e44e6
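For context, here is a minimal sketch (not from the repository) of the behavioral difference that motivates unifying on `nn.Module.to()`: a tensor's `.to()` is out-of-place and returns a new tensor, while a module's `.to()` moves its parameters and buffers in place and returns the module itself, so `layer = layer.to(target_device)` is the idiomatic way to relocate a whole `nn.Linear`. What the replaced `move_to_device` helper did internally is an assumption here; only its replacement is shown in the diff.

# Illustrative sketch only; names below are hypothetical, not from quantllm.
import torch
import torch.nn as nn

target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tensors: .to() returns a new tensor; the original is left where it was.
w = torch.randn(4, 4)
w_moved = w.to(target_device)

# Modules: .to() moves parameters and buffers in place and returns self,
# so reassignment keeps the same layer object, now on the new device.
layer = nn.Linear(4, 4)
layer = layer.to(target_device)
assert layer.weight.device.type == target_device.type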

3 files changed: +3 / -3 lines


quantllm/quant/awq.py

Lines changed: 1 addition & 1 deletion
@@ -195,7 +195,7 @@ def _quantize_layer(
         quantized = quantized.to(target_device)
 
         # Ensure layer parameters are on the target_device for computation
-        layer = move_to_device(layer, target_device)
+        layer = layer.to(target_device)
 
         # Copy bias if exists, ensuring it's on the target device
         if layer.bias is not None:

quantllm/quant/gguf.py

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ def _quantize_layer(
         """Quantize a single layer to GGUF format with memory-efficient processing."""
         target_device = torch.device('cpu') if self.cpu_offload else self.device_manager.primary_device
 
-        layer = move_to_device(layer, target_device)
+        layer = layer.to(target_device)
 
         # Initialize quantized layer and move to target_device
         quantized = QuantizedLinear(

quantllm/quant/gptq.py

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ def _quantize_layer(self, layer: nn.Linear, H: torch.Tensor) -> QuantizedLinear:
         # Ensure tensors are on the correct device
         H = move_to_device(H, target_device)
         # Original layer's weights should be moved to target_device before processing
-        layer = move_to_device(layer, target_device)
+        layer = layer.to(target_device)
         W = layer.weight.data  # W is now on target_device
 
         # Initialize quantized layer

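Taken together, the three hunks leave each quantizer's `_quantize_layer` with the same device-placement shape. A hedged skeleton of that shared pattern follows; the function signature, parameter names, and the use of `nn.Linear` as a stand-in for `QuantizedLinear` are assumptions for illustration, not the library's exact API.

# Hypothetical skeleton of the shared pattern; not the actual quantllm code.
import torch
import torch.nn as nn

def _quantize_layer_sketch(layer: nn.Linear, cpu_offload: bool, primary_device: torch.device) -> nn.Linear:
    # Pick CPU when offloading, otherwise the primary compute device.
    target_device = torch.device("cpu") if cpu_offload else primary_device

    # Unified module placement: nn.Module.to() moves weight and bias together.
    layer = layer.to(target_device)
    W = layer.weight.data  # now guaranteed to live on target_device

    # ... quantize W on target_device, then build the output layer there ...
    quantized = nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None)
    quantized = quantized.to(target_device)

    # Copy bias if it exists; both sides are already on target_device.
    if layer.bias is not None:
        quantized.bias.data.copy_(layer.bias.data)
    return quantized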