
Commit beebc60

Authored by aleozlx, ttyio, yongwww, sunghyunp-nvdia, and nv-yunzheq
feat: initial support for SM103, SM110, SM120, SM121 (#1608)
## 📌 Description

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---------

Co-authored-by: Vincent Huang <[email protected]>
Co-authored-by: Yong Wu <[email protected]>
Co-authored-by: Sunghyun Park <[email protected]>
Co-authored-by: Yunzhe Qiu <[email protected]>
Co-authored-by: Brian Ryu <[email protected]>
Co-authored-by: Ka-Hyun Nam <[email protected]>
Co-authored-by: yzh119 <[email protected]>
Co-authored-by: Zihao Ye <[email protected]>
1 parent 2cd065b commit beebc60


59 files changed: +1719 additions, −914 deletions

.github/workflows/release_wheel_aarch64.yml

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@ on:
        required: true

 env:
-  TORCH_CUDA_ARCH_LIST: "7.5 8.0 8.9 9.0+PTX"
+  FLASHINFER_CUDA_ARCH_LIST: "7.5 8.0 8.9 9.0+PTX"

 jobs:
   build:
@@ -77,7 +77,7 @@ jobs:
           -e FLASHINFER_CI_CUDA_VERSION=${{ matrix.cuda }} \
           -e FLASHINFER_CI_TORCH_VERSION=${{ matrix.torch }} \
           -e FLASHINFER_CI_PYTHON_VERSION=${{ matrix.python }} \
-          -e TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
+          -e FLASHINFER_CUDA_ARCH_LIST="$FLASHINFER_CUDA_ARCH_LIST" \
           -e MAX_JOBS=128 \
           --user $(id -u):$(id -g) \
           $BUILDER_IMAGE \

.github/workflows/release_wheel_sglang_x86_64.yml

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ on:
        required: true

 env:
-  TORCH_CUDA_ARCH_LIST: "7.5 8.0 8.9 9.0+PTX"
+  FLASHINFER_CUDA_ARCH_LIST: "7.5 8.0 8.9 9.0+PTX"

 jobs:
   build:
@@ -59,7 +59,7 @@ jobs:
           -e FLASHINFER_CI_TORCH_VERSION=${{ matrix.torch }} \
           -e FLASHINFER_CI_PYTHON_VERSION=3.10 \
           -e FLASHINFER_HEAD_DIMS="64,128,256" \
-          -e TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
+          -e FLASHINFER_CUDA_ARCH_LIST="$FLASHINFER_CUDA_ARCH_LIST" \
           -e MAX_JOBS=128 \
           --user $CI_UID:$CI_GID \
           $BUILDER_IMAGE \

.github/workflows/release_wheel_x86_64.yml

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ on:
        # required: true

 env:
-  TORCH_CUDA_ARCH_LIST: "7.5 8.0 8.9 9.0+PTX"
+  FLASHINFER_CUDA_ARCH_LIST: "7.5 8.0 8.9 9.0+PTX"

 jobs:
   build:
@@ -82,7 +82,7 @@ jobs:
           -e FLASHINFER_CI_CUDA_VERSION=${{ matrix.cuda }} \
           -e FLASHINFER_CI_TORCH_VERSION=${{ matrix.torch }} \
           -e FLASHINFER_CI_PYTHON_VERSION=3.10 \
-          -e TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
+          -e FLASHINFER_CUDA_ARCH_LIST="$FLASHINFER_CUDA_ARCH_LIST" \
           -e MAX_JOBS=128 \
           --user $CI_UID:$CI_GID \
           $BUILDER_IMAGE \
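All three release workflows rename the build-time environment variable from `TORCH_CUDA_ARCH_LIST` to `FLASHINFER_CUDA_ARCH_LIST`; the value keeps the same space-separated `major.minor` format, optionally suffixed with `a` or `+PTX`. As a hedged illustration only (this is not FlashInfer's actual build logic), a self-contained C++ sketch of parsing that format into numeric SM values:

```cpp
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Parse an arch list such as "7.5 8.0 8.9 9.0+PTX" into SM numbers (75, 80, 89, 90).
// Suffixes like "a" or "+PTX" are stripped. This mirrors the value format shown in
// the workflows only; FlashInfer's build may handle the variable differently.
std::vector<int> parse_arch_list(const std::string& list) {
  std::vector<int> sms;
  std::istringstream iss(list);
  std::string tok;
  while (iss >> tok) {
    std::string digits;  // keep leading digits and dot: "9.0a" -> "9.0"
    for (char c : tok) {
      if ((c >= '0' && c <= '9') || c == '.') digits.push_back(c);
      else break;
    }
    const double cc = std::stod(digits);                  // e.g. 8.9
    sms.push_back(static_cast<int>(cc * 10 + 0.5));       // e.g. 89
  }
  return sms;
}

int main() {
  for (int sm : parse_arch_list("7.5 8.0 8.9 9.0+PTX")) std::printf("%d\n", sm);
  return 0;
}
```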

README.md

Lines changed: 5 additions & 1 deletion
@@ -68,7 +68,7 @@ To pre-compile essential kernels ahead-of-time (AOT), run the following command:

 ```bash
 # Set target CUDA architectures
-export TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a"
+export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a"
 # Build AOT kernels. Will produce AOT kernels in aot-ops/
 python -m flashinfer.aot
 # Build AOT wheel
@@ -124,6 +124,10 @@ Starting from FlashInfer v0.2, users can customize their own attention variants

 FlashInfer also provides C++ API and TVM bindings, please refer to [documentation](https://docs.flashinfer.ai/) for more details.

+## GPU Support
+
+FlashInfer currently provides support for NVIDIA SM architectures 80 and higher and beta support for 103, 110, 120, and 121.
+
 ## Adoption

 We are thrilled to share that FlashInfer is being adopted by many cutting-edge projects, including but not limited to:
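The new GPU Support section in the README states that SM 80 and newer are supported, with SM 103, 110, 120, and 121 in beta. As an illustration only (not FlashInfer code), a minimal CUDA runtime sketch for checking which bucket the local device falls into:

```cpp
// check_sm.cu -- illustrative only, not part of the FlashInfer codebase.
#include <cstdio>

#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
    std::printf("no CUDA device found\n");
    return 1;
  }
  const int sm = prop.major * 10 + prop.minor;  // e.g. 9.0 -> 90, 12.0 -> 120
  const bool beta = (sm == 103 || sm == 110 || sm == 120 || sm == 121);
  if (sm >= 80) {
    std::printf("SM %d: %s\n", sm, beta ? "beta support" : "supported");
  } else {
    std::printf("SM %d: unsupported (FlashInfer requires SM 80 or newer)\n", sm);
  }
  return 0;
}
```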

csrc/group_gemm_mxfp4_groupwise_sm100.cu

Lines changed: 1 addition & 1 deletion
@@ -134,7 +134,7 @@ void CutlassGroupGemmMXFP4GroupwiseScaledSM100(at::Tensor int_workspace_buffer,
                                                int64_t k, int64_t mma_sm, int64_t tile_m,
                                                int64_t tile_n, int64_t tile_k, bool swap_ab) {
   const c10::cuda::OptionalCUDAGuard device_guard(float_workspace_buffer.device());
-  auto stream = at::cuda::getCurrentCUDAStream();
+  auto stream = at::cuda::getCurrentCUDAStream(A.device().index());
   int num_groups = m_indptr.size(0) - 1;
   DISPATCH_PYTORCH_INPUT_OUTPUT_DTYPE(
       A.scalar_type(), B.scalar_type(), SFA.scalar_type(), SFB.scalar_type(), D.scalar_type(),
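The one-line change queries the current stream for the device that owns tensor `A` instead of the ambient current device, so callers on multi-GPU systems get a stream that matches the guarded device. A hedged sketch of the same guard-plus-stream pattern in a generic PyTorch C++ extension (the function and tensor names are placeholders, not FlashInfer's API):

```cpp
// Illustrative pattern only; run_on_tensor_device is a placeholder, not a FlashInfer entry point.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

void run_on_tensor_device(const at::Tensor& input) {
  // Make input's device current for the duration of this scope.
  const c10::cuda::OptionalCUDAGuard device_guard(input.device());
  // Ask for the current stream of that device index explicitly, rather than
  // whichever device happened to be current before the guard was entered.
  auto stream = at::cuda::getCurrentCUDAStream(input.device().index());
  // ... enqueue kernels on `stream` ...
  (void)stream;
}
```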

csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h

Lines changed: 1 addition & 0 deletions
@@ -374,6 +374,7 @@ struct CutlassGemmConfig {

   int getTileConfigAsInt() const {
     if (sm_version == 120) return (int)tile_config_sm120;
+    if (sm_version == 110) return (int)tile_config_sm100;
     if (sm_version >= 100) return (int)tile_config_sm100;
     if (sm_version == 90) return (int)tile_config_sm90;
     if (sm_version < 90) return (int)tile_config_sm80;

csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp

Lines changed: 59 additions & 0 deletions
@@ -415,6 +415,62 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(
 #endif
 }

+std::vector<CutlassGemmConfig> get_candidate_configs_sm110(
+    CutlassGemmConfig::CandidateConfigTypeParam const config) {
+#ifdef FAST_BUILD
+  // Fast build disables all configs except this
+  return {CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x128x128B,
+                            MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO,
+                            ClusterShape::ClusterShape_1x1x1}};
+#else
+  std::vector<CutlassGemmConfig> candidate_configs;
+  for (int cluster_m = 1; cluster_m <= 2; cluster_m++) {
+    bool Is2SM = cluster_m == 2;
+    for (int cluster_n = 1; cluster_n <= 2; cluster_n++) {
+      std::vector base = {// M=128
+                          CutlassTileConfigSM100::CtaShape128x128x128B,
+                          CutlassTileConfigSM100::CtaShape128x256x128B};
+
+      if (Is2SM) {
+        if (cluster_n == 1) {
+          base.push_back(CutlassTileConfigSM100::CtaShape128x64x128B);
+          base.push_back(CutlassTileConfigSM100::CtaShape256x64x128B);
+        }
+
+        std::vector twosm = {// M=256
+                             CutlassTileConfigSM100::CtaShape256x128x128B,
+                             CutlassTileConfigSM100::CtaShape256x256x128B};
+        std::copy(twosm.begin(), twosm.end(), std::back_inserter(base));
+      } else {
+        if (cluster_n == 1) {
+          base.push_back(CutlassTileConfigSM100::CtaShape128x32x128B);
+          if ((config & CutlassGemmConfig::FP8_ONLY) != 0) {
+            base.push_back(CutlassTileConfigSM100::CtaShape128x16x128B);
+          }
+        }
+
+        std::vector onesm{CutlassTileConfigSM100::CtaShape64x64x128B,
+                          CutlassTileConfigSM100::CtaShape64x128x128B,
+                          CutlassTileConfigSM100::CtaShape64x256x128B,
+                          CutlassTileConfigSM100::CtaShape128x64x128B};
+        std::copy(onesm.begin(), onesm.end(), std::back_inserter(base));
+      }
+
+      constexpr std::array cluster_shapes = {
+          std::array{ClusterShape::ClusterShape_1x1x1, ClusterShape::ClusterShape_1x2x1},
+          std::array{ClusterShape::ClusterShape_2x1x1, ClusterShape::ClusterShape_2x2x1}};
+      auto cluster = cluster_shapes[cluster_m - 1][cluster_n - 1];
+      for (auto tile : base) {
+        CutlassGemmConfig config{tile, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO,
+                                 cluster};
+        candidate_configs.push_back(config);
+      }
+    }
+  }
+  return candidate_configs;
+#endif
+}
+
 std::vector<CutlassGemmConfig> get_candidate_configs_sm120(
     CutlassGemmConfig::CandidateConfigTypeParam const config) {
 #ifdef FAST_BUILD
@@ -478,6 +534,9 @@ std::vector<CutlassGemmConfig> get_candidate_configs(
   if (sm == 90 && (config_type_param & CutlassGemmConfig::HOPPER)) {
     return get_candidate_configs_sm90(config_type_param);
   }
+  if (sm == 110 && (config_type_param & CutlassGemmConfig::BLACKWELL)) {
+    return get_candidate_configs_sm110(config_type_param);
+  }
   if (sm >= 100 && sm < 120 && (config_type_param & CutlassGemmConfig::BLACKWELL)) {
     return get_candidate_configs_sm100(config_type_param);
   }
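The new `get_candidate_configs_sm110` builds its candidate list as a cross product of cluster shapes (1x1, 1x2, 2x1, 2x2) and the CTA tile shapes allowed for each, with extra tiles added only when `cluster_n == 1`. A simplified, self-contained mirror of that enumeration (plain strings instead of the CUTLASS enums; the FP8-only 128x16 tile is noted in a comment rather than enumerated) shows the shape of the list it produces:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Simplified mirror of the SM110 heuristic loop: enumerate cluster shapes and the
// CTA tiles allowed for each one. Names are illustrative, not the CUTLASS types.
int main() {
  for (int cluster_m = 1; cluster_m <= 2; ++cluster_m) {
    const bool two_sm = (cluster_m == 2);
    for (int cluster_n = 1; cluster_n <= 2; ++cluster_n) {
      std::vector<std::string> tiles = {"128x128x128B", "128x256x128B"};
      if (two_sm) {
        if (cluster_n == 1) {
          tiles.push_back("128x64x128B");
          tiles.push_back("256x64x128B");
        }
        tiles.push_back("256x128x128B");
        tiles.push_back("256x256x128B");
      } else {
        if (cluster_n == 1) tiles.push_back("128x32x128B");  // plus 128x16 for FP8-only builds
        tiles.insert(tiles.end(), {"64x64x128B", "64x128x128B", "64x256x128B", "128x64x128B"});
      }
      for (const auto& t : tiles)
        std::printf("cluster %dx%dx1  tile %s\n", cluster_m, cluster_n, t.c_str());
    }
  }
  return 0;
}
```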

csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h

Lines changed: 6 additions & 1 deletion
@@ -726,7 +726,12 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
   // We allow both tma warp specialized and SM80 configurations to coexist because for some
   // cases with small numbers of tokens SM80 is faster. We check here to see which is selected
   if (inputs.gemm_config.sm_version >= 90) {
-    TLLM_CHECK_WITH_INFO(inputs.gemm_config.sm_version == sm_,
+    bool is_same_sm = inputs.gemm_config.sm_version == sm_;
+    // gemm_config.sm_version indicates the kernel pipeline, which is always 100 for 100, 103,
+    // 110 below logging helps confirming the cutlass pipeline matches the device major version
+    bool is_sm110 = inputs.gemm_config.sm_version == 100 && sm_ == 110;
+    bool is_sm103 = inputs.gemm_config.sm_version == 100 && sm_ == 103;
+    TLLM_CHECK_WITH_INFO(is_same_sm || is_sm110 || is_sm103,
                          "Using SM %d configuration for SM %d device",
                          inputs.gemm_config.sm_version, sm_);
     TLLM_CHECK_WITH_INFO(inputs.biases != nullptr || hopper_inputs.ptr_c == nullptr,
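The relaxed check encodes the fact that a config whose `sm_version` is 100 names the SM100 kernel pipeline, which SM103 and SM110 devices also run. A tiny standalone restatement of that predicate (illustrative only, not a TensorRT-LLM or FlashInfer API):

```cpp
#include <cassert>

// Illustrative predicate: a gemm config whose sm_version names the SM100 kernel
// pipeline is also accepted on SM103 and SM110 devices, since they run the same
// CUTLASS pipeline; all other combinations must match exactly.
bool config_matches_device(int config_sm_version, int device_sm) {
  if (config_sm_version == device_sm) return true;  // exact match, e.g. 90 on SM90
  if (config_sm_version == 100 && (device_sm == 103 || device_sm == 110)) return true;
  return false;
}

int main() {
  assert(config_matches_device(100, 100));
  assert(config_matches_device(100, 103));
  assert(config_matches_device(100, 110));
  assert(!config_matches_device(90, 100));  // Hopper config on a Blackwell device is rejected
  return 0;
}
```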

csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h

Lines changed: 20 additions & 1 deletion
@@ -327,7 +327,26 @@ void dispatchMoeGemmSelectTileShapeTmaWarpSpecialized(
     } else {
       TLLM_THROW("Unsupported SM90 configuration requested");
     }
-  } else if (gemm_config.sm_version >= 100 && gemm_config.sm_version < 120) {
+  } else if (gemm_config.sm_version == 110) {
+    if constexpr (kernels::cutlass_kernels::isValidBlackwellMOESpecialisation<
+                      T, WeightType, EpilogueTag, FUSION>()) {
+      switch (gemm_config.tile_config_sm100) {
+        SHAPE_CASE(100, 64, 64, 128)
+        SHAPE_CASE(100, 64, 128, 128)
+        SHAPE_CASE(100, 64, 256, 128)
+
+        SHAPE_CASE(100, 128, 16, 128)
+        SHAPE_CASE(100, 128, 32, 128)
+        SHAPE_CASE(100, 128, 64, 128)
+        SHAPE_CASE(100, 128, 128, 128)
+        SHAPE_CASE(100, 128, 256, 128)
+
+        DEFAULT_CASE(100)
+      }
+    } else {
+      TLLM_THROW("Unsupported SM110 configuration requested");
+    }
+  } else if (gemm_config.sm_version >= 100 && gemm_config.sm_version < 110) {
     if constexpr (kernels::cutlass_kernels::isValidBlackwellMOESpecialisation<
                       T, WeightType, EpilogueTag, FUSION>()) {
       switch (gemm_config.tile_config_sm100) {

csrc/pytorch_extension_utils.h

Lines changed: 4 additions & 18 deletions
@@ -146,40 +146,26 @@ FLASHINFER_EXT_MODULE_INIT_EXPAND(TORCH_EXTENSION_NAME)
 #endif

 // Should not be used together with _DISPATCH_SF_CASE_FP8_E8M0
-#ifdef FLASHINFER_ENABLE_FP4_E2M1
-#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
+#if defined(FLASHINFER_ENABLE_FP4_E2M1) && \
+    (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
 #define _DISPATCH_CASE_FP4_E2M1(c_type, ...) \
   case at::ScalarType::Byte: {               \
     using c_type = __nv_fp4_e2m1;            \
     return __VA_ARGS__();                    \
   }
 #else
-#define _DISPATCH_CASE_FP4_E2M1(c_type, ...)                               \
-  case at::ScalarType::Byte: {                                             \
-    static_assert(false, "FP4 E2M1 support requires CUDA 12.8 or newer."); \
-    break;                                                                 \
-  }
-#endif
-#else
 #define _DISPATCH_CASE_FP4_E2M1(c_type, ...)
 #endif

 // Should not be used together with _DISPATCH_CASE_FP4_E2M1
-#ifdef FLASHINFER_ENABLE_FP8_E8M0
-#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
+#if defined(FLASHINFER_ENABLE_FP8_E8M0) && \
+    (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
 #define _DISPATCH_SF_CASE_FP8_E8M0(c_type, ...) \
   case at::ScalarType::Byte: {                  \
     using c_type = __nv_fp8_e8m0;               \
     return __VA_ARGS__();                       \
   }
 #else
-#define _DISPATCH_SF_CASE_FP8_E8M0(c_type, ...)                            \
-  case at::ScalarType::Byte: {                                             \
-    static_assert(false, "FP8 E8M0 support requires CUDA 12.8 or newer."); \
-    break;                                                                 \
-  }
-#endif
-#else
 #define _DISPATCH_SF_CASE_FP8_E8M0(c_type, ...)
 #endif
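After the cleanup, each case macro either expands to a single `case at::ScalarType::Byte:` that binds `c_type` and invokes the caller's lambda, or expands to nothing when FP4/FP8-E8M0 support is unavailable. A hedged sketch of how such a case macro is typically consumed inside a switch-based dispatcher (`dispatch_fp4` and its surrounding setup are hypothetical, not FlashInfer's actual dispatch macros):

```cpp
// Hypothetical usage sketch. Assumes "pytorch_extension_utils.h" is on the include
// path and the file is compiled by nvcc >= 12.8 with -DFLASHINFER_ENABLE_FP4_E2M1;
// otherwise _DISPATCH_CASE_FP4_E2M1 expands to nothing and only `default:` remains.
#include <ATen/ATen.h>

#include "pytorch_extension_utils.h"

template <typename Func>
bool dispatch_fp4(at::ScalarType dtype, Func&& func) {
  switch (dtype) {
    // Expands to: case at::ScalarType::Byte: { using c_type = __nv_fp4_e2m1; return <lambda>(); }
    _DISPATCH_CASE_FP4_E2M1(c_type, [&] { return func(c_type{}); })
    default:
      return false;  // dtype not supported in this build
  }
}
```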
