
Commit 4c1ac5f

strgrb and Zhang Kaihong authored
Support cuda<12.8 built for trtllm_allreduce_fusion. (#1508)
## 📌 Description

I want to use trtllm_allreduce_fusion with CUDA < 12.8 on Hopper GPUs in sglang, so this PR wraps the FP4 code in a CUDA version check so that it still compiles.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---------

Co-authored-by: Zhang Kaihong <[email protected]>
1 parent 6fb5105 commit 4c1ac5f
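The change is essentially a compile-time gate: `<cuda.h>` is included so that `CUDA_VERSION` is defined, and every FP4-specific piece (the `<cuda_fp4.h>` include, the e2m1 conversion helpers, and the FP4 quantization branches) is wrapped in `#if CUDA_VERSION >= 120800`, the threshold the diff uses for the 12.8 cutoff. A minimal sketch of the pattern, assuming a standalone translation unit compiled with nvcc (not the actual FlashInfer headers):

```cpp
// Minimal sketch of the version guard this PR adds (illustrative; mirrors the diff, not the exact files).
#include <cuda.h>       // defines CUDA_VERSION
#include <cuda_bf16.h>
#include <cuda_fp16.h>

#if CUDA_VERSION >= 120800
#include <cuda_fp4.h>   // FP4 (e2m1) header that pre-12.8 toolkits do not ship
#endif

#include <cstdio>

int main() {
#if CUDA_VERSION >= 120800
  std::printf("CUDA_VERSION=%d: FP4 path compiled in\n", CUDA_VERSION);
#else
  std::printf("CUDA_VERSION=%d: FP4 path compiled out; FP16/BF16/FP8 paths remain\n", CUDA_VERSION);
#endif
  return 0;
}
```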

File tree

include/flashinfer/comm/trtllm_allreduce_fusion.cuh
include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh

2 files changed: +27 -3 lines


include/flashinfer/comm/trtllm_allreduce_fusion.cuh

Lines changed: 13 additions & 3 deletions
```diff
@@ -1,7 +1,11 @@
 #include <cooperative_groups.h>
+#include <cuda.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
+
+#if CUDA_VERSION >= 120800
 #include <cuda_fp4.h>
+#endif
 
 #include <cuda/std/optional>
 #include <tuple>
@@ -532,6 +536,7 @@ __forceinline__ __device__ uint32_t pack_bytes(uint8_t c0, uint8_t c1, uint8_t c
   return (val3 << 24) | (val2 << 16) | (val1 << 8) | val0;
 }
 
+#if CUDA_VERSION >= 120800
 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 // NOTE: bypass sm_100 requirement by __nv_cvt_float2_to_fp4x2
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
@@ -672,6 +677,8 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(vec_t<T, VEC_SIZE>& vec, float SFScaleV
 #endif
 }
 
+#endif
+
 }  // namespace utils
 
 template <typename T, uint32_t VEC_SIZE>
@@ -943,14 +950,17 @@ class FusedOp {
       }
     }
 
+#if CUDA_VERSION >= 120800
     if constexpr (GetQuantType<Pattern> == QuantType::kFP4) {
      // NOTE(Yingyi): might update later
      auto sf_out = utils::cvt_quant_to_fp4_get_sf_out_offset<uint32_t, 2>(
          std::nullopt /* batchIdx */, token_id, m_access_id_in_token, std::nullopt /* numRows */,
          m_params.hidden_dim, reinterpret_cast<uint32_t*>(m_params.scale_out), m_params.layout);
      reinterpret_cast<uint32_t*>(m_params.quant_out)[m_access_id] =
          utils::cvt_warp_fp16_to_fp4<T, VEC_SIZE>(val, m_scale_factor, sf_out);
-    } else if constexpr (GetQuantType<Pattern> == QuantType::kFP8) {
+    } else
+#endif
+    if constexpr (GetQuantType<Pattern> == QuantType::kFP8) {
      using PackedQuantizedType = std::conditional_t<std::is_same_v<T, float>, float, float2>;
      PackedQuantizedType ret;
 #pragma unroll
@@ -1431,7 +1441,7 @@ cudaError_t allreduce_fusion_op(AllReduceFusionParams<T> const& params, bool lau
       DISPATCH_ACC_TYPE(T, AllReduceFusionPattern::kARResidualRMSNormFP8Quant, NRanks); \
       break; \
     case AllReduceFusionPattern::kARResidualRMSNormFP4Quant: \
-      if constexpr (!std::is_same_v<T, float>) { \
+      if constexpr (!std::is_same_v<T, float> && CUDA_VERSION >= 120800) { \
        DISPATCH_ACC_TYPE(T, AllReduceFusionPattern::kARResidualRMSNormFP4Quant, NRanks); \
      } else { \
        FLASHINFER_CHECK(false, "FP4Quant pattern cannot work with DType=float!"); \
@@ -1441,7 +1451,7 @@ cudaError_t allreduce_fusion_op(AllReduceFusionParams<T> const& params, bool lau
       DISPATCH_ACC_TYPE(T, AllReduceFusionPattern::kARResidualRMSNormOutFP8Quant, NRanks); \
       break; \
     case AllReduceFusionPattern::kARResidualRMSNormOutFP4Quant: \
-      if constexpr (!std::is_same_v<T, float>) { \
+      if constexpr (!std::is_same_v<T, float> && CUDA_VERSION >= 120800) { \
        DISPATCH_ACC_TYPE(T, AllReduceFusionPattern::kARResidualRMSNormOutFP4Quant, NRanks); \
      } else { \
        FLASHINFER_CHECK(false, "OutFP4Quant pattern cannot work with DType=float!"); \
```
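One detail worth noting in `FusedOp`: the added `} else` / `#endif` splice keeps the FP8 branch well-formed in both configurations; with the guard satisfied it reads as an `else if constexpr` chain, and with the guard off it collapses to a standalone `if constexpr`. A toy sketch of that expansion, using hypothetical names (`MY_CUDA_VERSION`, `quantize`) rather than the FlashInfer code:

```cpp
// Toy illustration of the preprocessor/else-if splice used in FusedOp (hypothetical names).
#include <cstdio>

#define MY_CUDA_VERSION 12080  // stand-in for CUDA_VERSION from <cuda.h>

template <int kQuantBits>  // 4 = "FP4", 8 = "FP8"
void quantize() {
#if MY_CUDA_VERSION >= 120800
  if constexpr (kQuantBits == 4) {
    std::puts("FP4 branch (only exists when the toolkit guard is satisfied)");
  } else
#endif
  if constexpr (kQuantBits == 8) {
    std::puts("FP8 branch (compiled in either configuration)");
  }
}

int main() {
  quantize<8>();  // with the guard off, this is just `if constexpr (kQuantBits == 8)`
  quantize<4>();  // with the guard off, the FP4 branch simply does not exist
  return 0;
}
```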

include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh

Lines changed: 14 additions & 0 deletions
```diff
@@ -1,7 +1,11 @@
 #include <cooperative_groups.h>
+#include <cuda.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
+
+#if CUDA_VERSION >= 120800
 #include <cuda_fp4.h>
+#endif
 
 #include <cuda/std/optional>
 #include <tuple>
@@ -519,6 +523,7 @@ __forceinline__ __device__ uint32_t pack_bytes(uint8_t c0, uint8_t c1, uint8_t c
   return (val3 << 24) | (val2 << 16) | (val1 << 8) | val0;
 }
 
+#if CUDA_VERSION >= 120800
 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 // NOTE:bypass sm_100 requirement by __nv_cvt_float2_to_fp4x2
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
@@ -658,6 +663,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(vec_t<T, VEC_SIZE>& vec, float SFScaleV
   return 0;
 #endif
 }
+#endif
 }  // namespace utils
 
 template <typename T>
@@ -828,6 +834,7 @@ __device__ __forceinline__ void fused_op(vec_t<T, VEC_SIZE> const& val, int acce
   if constexpr (NormOut) {
     norm_val.store(reinterpret_cast<T*>(params.norm_out) + access_id * VEC_SIZE);
   }
+#if CUDA_VERSION >= 120800
   if constexpr (QuantOut) {
     constexpr int SF_VEC_SIZE = 16;
     auto sf_out = utils::cvt_quant_to_fp4_get_sf_out_offset<uint32_t, 2>(
@@ -836,6 +843,7 @@ __device__ __forceinline__ void fused_op(vec_t<T, VEC_SIZE> const& val, int acce
     reinterpret_cast<uint32_t*>(params.quant_out)[access_id] =
        utils::cvt_warp_fp16_to_fp4<T, VEC_SIZE>(norm_val, params.scale_factor, sf_out);
   }
+#endif
 }
 
 template <typename T>
@@ -1486,6 +1494,12 @@ cudaError_t moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams<T>
   auto status = DISPATCH_MOEFINALIZEREDUCTION(
       params.nranks, params.residual_out, params.rms_gamma, params.quant_out, N_RANKS, RES, RMS,
       QUANT, [&]() -> cudaError_t {
+        if constexpr (CUDA_VERSION < 120800 && QUANT) {
+          FLASHINFER_CHECK(false,
+                           "cuda version should be greater equal than 12.8 with "
+                           "trtllm_moe_allreduce_fusion quant");
+          return cudaErrorNotSupported;
+        }
         FLASHINFER_CUDA_CALL(
             (moefinalize_allreduce_fusion_kernel_launcher<T, N_RANKS, RES, RMS, QUANT>(
                 (params), (launch_with_pdl))));
```
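In the MoE path the unsupported combination is rejected inside the dispatch lambda instead of being compiled out: when quantized output is requested below the version threshold, the branch fires `FLASHINFER_CHECK(false, ...)` and returns `cudaErrorNotSupported` before any launch. A reduced sketch of that shape, using a hypothetical `launch<kQuant>` helper and the same `CUDA_VERSION` comparison the diff uses (not the FlashInfer dispatch macros):

```cpp
// Reduced sketch of rejecting an unsupported template branch at dispatch time (hypothetical names).
#include <cuda.h>          // provides CUDA_VERSION
#include <cuda_runtime.h>  // cudaError_t, cudaGetErrorString
#include <cstdio>

template <bool kQuant>
cudaError_t launch() {
  if constexpr (CUDA_VERSION < 120800 && kQuant) {
    // FP4 quantized output is gated on the newer toolkit; bail out before launching anything.
    return cudaErrorNotSupported;
  }
  // ... the real kernel launch would go here ...
  return cudaSuccess;
}

int main() {
  std::printf("quant path:    %s\n", cudaGetErrorString(launch<true>()));
  std::printf("no-quant path: %s\n", cudaGetErrorString(launch<false>()));
  return 0;
}
```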
