Skip to content

Commit f4608ab

Browse files
authored
fix cpu fallback device restore (#2664)
1 parent 25dd8ea commit f4608ab

File tree

2 files changed

+76
-18
lines changed

2 files changed

+76
-18
lines changed

gptqmodel/quantization/gptq.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,17 @@ def mock_hessian_inverse(self, H: torch.Tensor):
254254
identity = torch.eye(H.shape[0], dtype=torch.float32, device=H.device)
255255
return identity, damp
256256

257+
def log_cpu_fallback(self, stage: str, source_device: torch.device) -> None:
    """Warn that a memory-heavy GPTQ step is falling back from CUDA to CPU.

    Args:
        stage: Human-readable name of the step that hit CUDA OOM
            (e.g. "Hessian inverse", "Hessian permutation").
        source_device: Device the computation was running on before the
            fallback was triggered.
    """
    # Keep %-style lazy args so formatting only happens if the record
    # is actually emitted by the logger.
    template = (
        "Quantization: Module `%s` -> CUDA OOM during %s on %s; falling back to CPU. "
        "Due to this fallback, the calculation may take much longer than normal."
    )
    log.warn(template, self.name, stage, source_device)
267+
257268
def clone_module(self, copy=True, device: torch.device = None):
258269
if not device:
259270
device = self.module.weight.data.device
@@ -886,6 +897,8 @@ def quantize(
886897
start = time.time()
887898

888899
target_device = getattr(self.module, "target_device", None)
900+
result_device = torch.device(self.module.weight.data.device)
901+
cpu_fallback_used = False
889902
from ..utils.fallback import resolve_fallback_strategy, resolve_threshold, should_use_fallback
890903

891904
resolved_strategy = resolve_fallback_strategy(self.fallback)
@@ -971,11 +984,8 @@ def quantize(
971984
if self.H.device.type != "cuda" or "out of memory" not in str(exc).lower():
972985
raise
973986

974-
log.warn(
975-
"Quantization: Module `%s` -> CUDA OOM during Hessian permutation on %s; retrying that module on CPU.",
976-
self.name,
977-
self.H.device,
978-
)
987+
self.log_cpu_fallback("Hessian permutation", self.H.device)
988+
cpu_fallback_used = True
979989
cpu_device = torch.device("cpu")
980990
perm = perm.to(device=cpu_device)
981991
W = W.to(device=cpu_device)[:, perm]
@@ -1002,11 +1012,8 @@ def quantize(
10021012
if self.H.device.type != "cuda" or "out of memory" not in str(exc).lower():
10031013
raise
10041014

1005-
log.warn(
1006-
"Quantization: Module `%s` -> CUDA OOM during act-group Hessian permutation on %s; retrying that module on CPU.",
1007-
self.name,
1008-
self.H.device,
1009-
)
1015+
self.log_cpu_fallback("act-group Hessian permutation", self.H.device)
1016+
cpu_fallback_used = True
10101017
cpu_device = torch.device("cpu")
10111018
final_perm = final_perm.to(device=cpu_device)
10121019
W = W.to(device=cpu_device)[:, final_perm]
@@ -1022,11 +1029,8 @@ def quantize(
10221029

10231030
# Full-attention blocks on very large models can exceed GPU memory during the
10241031
# dense Hessian inverse; finish that module on CPU instead of aborting the run.
1025-
log.warn(
1026-
"Quantization: Module `%s` -> CUDA OOM during Hessian inverse on %s; retrying quantization on CPU.",
1027-
self.name,
1028-
self.H.device,
1029-
)
1032+
self.log_cpu_fallback("Hessian inverse", self.H.device)
1033+
cpu_fallback_used = True
10301034
cpu_device = torch.device("cpu")
10311035
self.H = self.H.to(device=cpu_device)
10321036
W = W.to(device=cpu_device)
@@ -1233,12 +1237,13 @@ def quantize(
12331237
g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
12341238

12351239
if self.qcfg.desc_act and use_hessian:
1240+
invperm = invperm.to(device=Q.device)
12361241
Q = Q[:, invperm]
12371242
g_idx = g_idx[invperm]
12381243
del perm, invperm
12391244

12401245
elif self.qcfg.act_group_aware and use_hessian:
1241-
inv_final = invert_perm(final_perm)
1246+
inv_final = invert_perm(final_perm).to(device=Q.device)
12421247
Q = Q[:, inv_final]
12431248
inv_global_perm = invert_perm(global_perm)
12441249
inv_global_perm_list = inv_global_perm.tolist()
@@ -1273,7 +1278,14 @@ def quantize(
12731278
scale = self.truncate_last_dim(scale, valid_cols)
12741279
zero = self.truncate_last_dim(zero, valid_cols)
12751280

1276-
Q = Q.to(device=self.module.weight.data.device, non_blocking=False)
1281+
if cpu_fallback_used and Q.device != result_device:
1282+
log.info(
1283+
"Quantization: Module `%s` -> CPU fallback complete; moving final quantized weights back to %s.",
1284+
self.name,
1285+
result_device,
1286+
)
1287+
1288+
Q = Q.to(device=result_device, non_blocking=False)
12771289

12781290
duration = time.time() - start
12791291

tests/test_gptq.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,53 @@ def _run_batch(idx: int) -> None:
8585
return PathStats(per_batch_seconds=per_batch, total_seconds=total, peak_bytes=peak_bytes, batches_measured=measured)
8686

8787

88+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for CPU fallback regression coverage")
def test_gptq_cpu_hessian_fallback_returns_quantized_weights_to_original_cuda_device(monkeypatch):
    """Regression test for the CPU-fallback device restore.

    Simulates a CUDA OOM during the Hessian inverse so `quantize()` retries on
    CPU, then checks that the final quantized weights are moved back to the
    original CUDA device and that both the fallback warning and the
    device-restore info message are logged.
    """
    device = torch.device("cuda", 0)
    torch.cuda.set_device(device)
    # Deterministic weights/activations for the tiny test module.
    torch.manual_seed(0)

    layer = _make_module(hidden_dim=8, device=device)
    qcfg = QuantizeConfig(bits=4, group_size=2, act_group_aware=True)
    gptq = GPTQ(layer, qcfg=qcfg)
    gptq.quantizer.configure(perchannel=True)

    inp = _generate_input(batch_size=1, seq_len=4, hidden_dim=8, device=device)
    gptq.add_batch(inp, None)

    # Track how many times the (patched) Hessian inverse runs on each device.
    calls = {"cuda": 0, "cpu": 0}

    def _patched_hessian_inverse(self, hessian: torch.Tensor):
        # First call arrives on CUDA: raise a fake OOM whose message contains
        # "out of memory" so quantize() takes its CPU fallback path.
        if hessian.device.type == "cuda":
            calls["cuda"] += 1
            raise RuntimeError("CUDA out of memory. simulated for regression test")

        # CPU retry: return a trivial identity "inverse" plus the damp factor.
        calls["cpu"] += 1
        identity = torch.eye(hessian.shape[0], dtype=torch.float32, device=hessian.device)
        return identity, self.qcfg.damp_percent

    monkeypatch.setattr(GPTQ, "hessian_inverse", _patched_hessian_inverse)
    log_messages = []

    # Capture both log.warn (fallback notice) and log.info (restore notice);
    # render %-style lazy args so the assertions below can match full text.
    def _capture_warn(message, *args, **kwargs):
        log_messages.append(message % args if args else message)

    def _capture_info(message, *args, **kwargs):
        log_messages.append(message % args if args else message)

    monkeypatch.setattr(gptq_mod.log, "warn", _capture_warn)
    monkeypatch.setattr(gptq_mod.log, "info", _capture_info)

    qweight, _, _, _, *_ = gptq.quantize(blocksize=4)

    # Exactly one failed CUDA attempt followed by one successful CPU retry.
    assert calls == {"cuda": 1, "cpu": 1}
    # Core regression check: result lands back on the original CUDA device.
    assert qweight.device == device
    joined_logs = "\n".join(log_messages)
    assert "falling back to CPU" in joined_logs
    assert "may take much longer than normal" in joined_logs
    assert "moving final quantized weights back" in joined_logs
133+
134+
88135
class TestGPTQAddBatchCPU(ModelTest):
89136
######### test_gptq_add_batch_cpu.py ###########
90137
pytestmark = pytest.mark.skipif(
@@ -331,4 +378,3 @@ def get_random_word(self):
331378
pytest.skip(
332379
f"Streaming event helper subprocess unavailable: rc={result.returncode}, stderr={result.stderr.strip()}"
333380
)
334-

0 commit comments

Comments
 (0)