
Commit 99028fd

Fix INT8 quantization error on Blackwell GPUs (SM100+) (#25935)

Signed-off-by: padg9912 <[email protected]>
1 parent 1244948 commit 99028fd

File tree: 2 files changed, +9 −2 lines changed

csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp (4 additions, 1 deletion)

```diff
@@ -25,7 +25,10 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
     if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
       int8_func(c, a, b, a_scales, b_scales, bias);
     } else {
-      TORCH_CHECK(false, "Int8 not supported for this architecture");
+      int32_t version_num = get_sm_version_num();
+      TORCH_CHECK(
+          false, "Int8 not supported on SM", version_num,
+          ". Use FP8 quantization instead, or run on older arch (SM < 100).");
     }
   }
 } else {
```
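The hunk above replaces a generic failure with a message that reports the detected SM version and suggests FP8. A minimal standalone sketch of the same dispatch-or-fail logic follows; the function name `dispatch_int8` and the `sm_version`/`has_int8_kernel` parameters are illustrative stand-ins, not vLLM's actual CUTLASS plumbing (in vLLM the SM version comes from `get_sm_version_num()` and the "kernel available" condition is the compile-time `Int8Func` check):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>

// Hypothetical sketch: pick the INT8 GEMM when a kernel exists for this
// architecture, otherwise fail with the same guidance the commit adds.
std::string dispatch_int8(int32_t sm_version, bool has_int8_kernel) {
  if (has_int8_kernel && sm_version < 100) {
    return "int8_gemm";  // dispatch to the INT8 CUTLASS kernel
  }
  // Mirrors the commit's TORCH_CHECK message: report the SM version and
  // point the user at FP8 as the supported alternative on SM100+.
  throw std::runtime_error(
      "Int8 not supported on SM" + std::to_string(sm_version) +
      ". Use FP8 quantization instead, or run on older arch (SM < 100).");
}
```

On Hopper (SM90) this selects the INT8 path; on Blackwell (SM100+) it raises the new, more actionable error instead of the old generic one.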

docs/features/quantization/int8.md (5 additions, 1 deletion)

```diff
@@ -6,7 +6,11 @@ This quantization method is particularly useful for reducing model size while ma
 Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
 
 !!! note
-    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
+    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
+
+!!! warning
+    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
+    Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.
 
 ## Prerequisites
```
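The docs change draws the supported window as compute capability above 7.5 (Turing through Hopper) but below SM100 (Blackwell). A small Python sketch of that check, as one might run it before choosing a quantization scheme; `int8_supported` is a hypothetical helper, not part of vLLM's API, and in practice the `(major, minor)` pair would come from something like `torch.cuda.get_device_capability()`:

```python
def int8_supported(major: int, minor: int) -> bool:
    """Return True if vLLM's INT8 CUTLASS kernels cover this GPU.

    Per the docs above: Turing (7.5) through Hopper (9.0) are supported;
    Blackwell (SM100 and newer) is not, so FP8 should be used there.
    """
    cap = major * 10 + minor  # e.g. (8, 0) -> 80, (10, 0) -> 100
    return 75 <= cap < 100
```

For example, an A100 (capability 8.0) passes the check, while a Blackwell GPU (capability 10.0 or 12.0) does not and should fall back to FP8.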
