Skip to content

Commit faa3c0f

Browse files
authored
use static cuda ctx for triton kernel launch (#2269)
* use static cuda ctx for triton kernel launch
* fix for xpu
1 parent 61e5e7f commit faa3c0f

File tree

1 file changed

+15
-14
lines changed

1 file changed

+15
-14
lines changed

gptqmodel/nn_modules/triton_utils/dequant.py

Lines changed: 15 additions & 14 deletions
```diff
@@ -264,20 +264,21 @@ def dequant(dtype, qweight, scales, qzeros, g_idx, bits, pack_bits, maxq):
     numels = out.numel()
     grid = lambda meta: (triton.cdiv(numels, meta["X_BLOCK"]),)  # noqa: E731

-    dequant_kernel[grid](
-        g_idx,
-        scales,
-        qweight,
-        qzeros,
-        out,
-        torch_dtype_to_triton(out_dtype),
-        numels,
-        pack_bits=pack_bits,
-        maxq=maxq,
-        bits=bits,
-        out_features=out_features,
-        num_groups=num_groups,
-    )
+    with torch.xpu.device(qweight.device) if HAS_XPU else torch.cuda.device(qweight.device):
+        dequant_kernel[grid](
+            g_idx,
+            scales,
+            qweight,
+            qzeros,
+            out,
+            torch_dtype_to_triton(out_dtype),
+            numels,
+            pack_bits=pack_bits,
+            maxq=maxq,
+            bits=bits,
+            out_features=out_features,
+            num_groups=num_groups,
+        )
     return out
```
0 commit comments

Comments
 (0)