
Commit fe29ed6

bugfix: guard fp8 e8m0 and e2m1 compile (#1287)
## πŸ“Œ Description

Fixes #1282

## πŸ” Related Issues

## πŸš€ Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### βœ… Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## πŸ§ͺ Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent de55a8f commit fe29ed6
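The whole fix is one compile-time guard applied in five places: code that touches `__nv_fp8_e8m0` or `__nv_fp4_e2m1` is only compiled when the nvcc version is at least 12.8, since those types first appear in the CUDA 12.8 toolkit headers. The check packs major and minor into a single integer, so for nvcc 12.8 it evaluates to 12 * 10000 + 8 * 100 = 120800, exactly the threshold. Below is a minimal sketch of the pattern, not a verbatim excerpt from the diff; the `cuda_fp8.h` include and the `encode_scale` helper are illustrative assumptions.

```cpp
#include <cstdint>

// Sketch of the guard used throughout this commit.
// nvcc 12.8 -> 12 * 10000 + 8 * 100 = 120800, so the check passes;
// nvcc 12.6 -> 120600, so the #error fires with a readable message instead of
// an "unknown type name '__nv_fp8_e8m0'" failure deep inside a kernel.
#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
#include <cuda_fp8.h>  // assumed: header that declares __nv_fp8_e8m0 on CUDA 12.8+

__device__ uint8_t encode_scale(float sf) {
  // Same conversion the guarded kernels perform: store the scale as an
  // E8M0 exponent byte, rounding the exponent up and saturating to finite.
  __nv_fp8_e8m0 tmp;
  tmp.__x = __nv_cvt_float_to_e8m0(sf, __NV_SATFINITE, cudaRoundPosInf);
  return tmp.__x;
}
#else
#error "FP8 E8M0 support requires CUDA 12.8 or newer."
#endif
```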

5 files changed: +39 -0 lines changed

csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh

Lines changed: 13 additions & 0 deletions

@@ -419,12 +419,16 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
   float outputScale;
   // Write the SF to global memory (STG.8).
   if constexpr (UE8M0_SF) {
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
     __nv_fp8_e8m0 tmp;
     // Scale the max value to the range of E2m1.
     vecMax *= reciprocal_approximate_ftz(6.0f);
     tmp.__x = __nv_cvt_float_to_e8m0(vecMax, __NV_SATFINITE, cudaRoundPosInf);
     fp8SFVal = tmp.__x;
     outputScale = exp2f_rcp(fp8SFVal);
+#else
+#error "FP8 E8M0 support requires CUDA 12.8 or newer."
+#endif
   } else {
     // Get the SF (max value of the vector / max value of e2m1).
     // maximum value of e2m1 = 6.0.
@@ -511,16 +515,21 @@ __device__ uint64_t cvt_warp_fp8_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
   uint8_t fp8SFVal;
   // Write the SF to global memory (STG.8).
   if constexpr (UE8M0_SF) {
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
     __nv_fp8_e8m0 tmp;
     tmp.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
     SFValue = static_cast<float>(tmp);
     fp8SFVal = tmp.__x;
+#else
+#error "FP8 E8M0 support requires CUDA 12.8 or newer."
+#endif
   } else {
     // Here SFValue is always positive, so E4M3 is the same as UE4M3.
     __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
     fp8SFVal = tmp.__x;
     SFValue = static_cast<float>(tmp);
   }
+
   // Get the output scale.
   // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * reciprocal(SFScaleVal))
   float outputScale = SFValue != 0 ? SFScaleVal * reciprocal_approximate_ftz(SFValue) : 0.0f;
@@ -551,6 +560,7 @@ __device__ uint64_t cvt_warp_fp8_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
 }

 // Quantizes the provided PackedVec into the uint64_t output
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
 template <class Type, int SF_VEC_SIZE>
 __device__ uint64_t cvt_warp_fp16_to_mxfp8(PackedVec<Type>& vec, uint8_t* SFout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
@@ -612,6 +622,9 @@ __device__ uint64_t cvt_warp_fp16_to_mxfp8(PackedVec<Type>& vec, uint8_t* SFout)
   return 0;
 #endif
 }
+#else
+#error "FP8 E8M0 support requires CUDA 12.8 or newer."
+#endif

 inline __host__ __device__ int64_t get_sf_out_offset_128x4(std::optional<int> batchIdx, int mIdx,
                                                            int kIdx, std::optional<int> numRows,
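In both guarded branches above, the scale factor is stored as UE8M0: a single exponent byte that the kernel later turns back into a power-of-two scale (e.g. `outputScale = exp2f_rcp(fp8SFVal)` in the fp16 path). The following rough host-side model of that round trip assumes the standard E8M0 layout of 8 exponent bits with bias 127 and only approximates the `cudaRoundPosInf` / `__NV_SATFINITE` behaviour of `__nv_cvt_float_to_e8m0`; the function names are illustrative.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Rough host-side model of the UE8M0 scale-factor round trip
// (assumed E8M0 layout: 8 exponent bits, no mantissa, bias 127).
uint8_t encode_e8m0_round_up(float scale) {
  // Round the exponent toward +inf (mimicking cudaRoundPosInf) so the decoded
  // scale never undershoots the block maximum, then saturate to the finite
  // range (mimicking __NV_SATFINITE).
  int biased = static_cast<int>(std::ceil(std::log2(scale))) + 127;
  return static_cast<uint8_t>(std::min(std::max(biased, 0), 254));
}

float decode_e8m0(uint8_t byte) { return std::exp2f(static_cast<int>(byte) - 127); }

int main() {
  float sf = 0.37f;                        // a per-block scale factor
  uint8_t enc = encode_e8m0_round_up(sf);  // what the kernel stores as fp8SFVal
  float decoded = decode_e8m0(enc);        // 0.5 here: next power of two >= sf
  printf("fp8SFVal=%d decoded=%f reciprocal=%f\n", enc, decoded, 1.0f / decoded);
  return 0;
}
```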

csrc/pytorch_extension_utils.h

Lines changed: 16 additions & 0 deletions

@@ -145,23 +145,39 @@ FLASHINFER_EXT_MODULE_INIT_EXPAND(TORCH_EXTENSION_NAME)

 // Should not be used together with _DISPATCH_SF_CASE_FP8_E8M0
 #ifdef FLASHINFER_ENABLE_FP4_E2M1
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
 #define _DISPATCH_CASE_FP4_E2M1(c_type, ...) \
   case at::ScalarType::Byte: {               \
     using c_type = __nv_fp4_e2m1;            \
     return __VA_ARGS__();                    \
   }
 #else
+#define _DISPATCH_CASE_FP4_E2M1(c_type, ...)                                \
+  case at::ScalarType::Byte: {                                              \
+    static_assert(false, "FP4 E2M1 support requires CUDA 12.8 or newer.");  \
+    break;                                                                  \
+  }
+#endif
+#else
 #define _DISPATCH_CASE_FP4_E2M1(c_type, ...)
 #endif

 // Should not be used together with _DISPATCH_CASE_FP4_E2M1
 #ifdef FLASHINFER_ENABLE_FP8_E8M0
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
 #define _DISPATCH_SF_CASE_FP8_E8M0(c_type, ...) \
   case at::ScalarType::Byte: {                  \
     using c_type = __nv_fp8_e8m0;               \
     return __VA_ARGS__();                       \
   }
 #else
+#define _DISPATCH_SF_CASE_FP8_E8M0(c_type, ...)                             \
+  case at::ScalarType::Byte: {                                              \
+    static_assert(false, "FP8 E8M0 support requires CUDA 12.8 or newer.");  \
+    break;                                                                  \
+  }
+#endif
+#else
 #define _DISPATCH_SF_CASE_FP8_E8M0(c_type, ...)
 #endif
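For orientation, these `_DISPATCH_*` case macros are designed to be pasted into a `switch` over `at::ScalarType`, with the callable arriving through `__VA_ARGS__` so that its body is expanded after the `using c_type = ...` alias and can refer to `c_type`. The sketch below shows that consumption pattern under stated assumptions; the `DISPATCH_SF_DTYPE` wrapper, `launch_kernel`, and `sf_tensor` are hypothetical names, not the actual FlashInfer dispatcher.

```cpp
// Hypothetical wrapper macro illustrating how the case macro is consumed.
// The lambda text is expanded inside the Byte case, after the c_type alias.
#define DISPATCH_SF_DTYPE(scalar_type, c_type, ...)            \
  [&]() -> bool {                                              \
    switch (scalar_type) {                                     \
      _DISPATCH_SF_CASE_FP8_E8M0(c_type, __VA_ARGS__)          \
      default:                                                 \
        TORCH_CHECK(false, "unsupported scale-factor dtype");  \
        return false;                                          \
    }                                                          \
  }()

// Usage sketch: on CUDA >= 12.8 the Byte case binds c_type to __nv_fp8_e8m0;
// on older toolkits the fallback macro's static_assert stops the build with a
// clear message instead of a missing-type error.
// DISPATCH_SF_DTYPE(sf_tensor.scalar_type(), c_type,
//                   [&] { launch_kernel<c_type>(/*...*/); return true; });
```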

include/flashinfer/comm/trtllm_allreduce_fusion.cuh

Lines changed: 4 additions & 0 deletions

@@ -623,10 +623,14 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(vec_t<T, VEC_SIZE>& vec, float SFScaleV
   uint8_t fp8SFVal;
   // Write the SF to global memory (STG.8).
   if constexpr (UE8M0_SF) {
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
     __nv_fp8_e8m0 tmp;
     tmp.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
     SFValue = static_cast<float>(tmp);
     fp8SFVal = tmp.__x;
+#else
+#error "FP8 E8M0 support requires CUDA 12.8 or newer."
+#endif
   } else {
     // Here SFValue is always positive, so E4M3 is the same as UE4M3.
     __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);

include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh

Lines changed: 4 additions & 0 deletions

@@ -610,10 +610,14 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(vec_t<T, VEC_SIZE>& vec, float SFScaleV
   uint8_t fp8SFVal;
   // Write the SF to global memory (STG.8).
   if constexpr (UE8M0_SF) {
+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
     __nv_fp8_e8m0 tmp;
     tmp.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
     SFValue = static_cast<float>(tmp);
     fp8SFVal = tmp.__x;
+#else
+#error "FP8 E8M0 support requires CUDA 12.8 or newer."
+#endif
   } else {
     // Here SFValue is always positive, so E4M3 is the same as UE4M3.
     __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);

include/flashinfer/cutlass_utils.cuh

Lines changed: 2 additions & 0 deletions

@@ -71,6 +71,7 @@ struct cutlass_dtype<__nv_fp8_e5m2> {
   using type = cutlass::float_e5m2_t;
 };

+#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
 template <>
 struct cutlass_dtype<__nv_fp8_e8m0> {
   using type = cutlass::float_ue8m0_t;
@@ -82,6 +83,7 @@ struct cutlass_dtype<__nv_fp4_e2m1> {
   using type = cutlass::float_e2m1_t;
 };
 #endif
+#endif

 template <typename T>
 using cutlass_dtype_t = typename cutlass_dtype<T>::type;
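The `cutlass_dtype` trait maps CUDA scalar types to their CUTLASS equivalents, so with this guard the `__nv_fp8_e8m0` and `__nv_fp4_e2m1` specializations are only visible on CUDA 12.8+. A minimal usage sketch follows; it assumes the trait lives in the `flashinfer` namespace (as the header path suggests) and that the include path and alias names are as shown, which are illustrative.

```cpp
#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
#include "flashinfer/cutlass_utils.cuh"  // assumed include path for the trait

// When the guard passes, the trait resolves the MX element/scale types:
//   cutlass_dtype_t<__nv_fp4_e2m1> -> cutlass::float_e2m1_t   (quantized elements)
//   cutlass_dtype_t<__nv_fp8_e8m0> -> cutlass::float_ue8m0_t  (block scale factors)
using CutlassElemType = flashinfer::cutlass_dtype_t<__nv_fp4_e2m1>;
using CutlassSfType   = flashinfer::cutlass_dtype_t<__nv_fp8_e8m0>;
#endif
```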
