perf: use lightweight API to query device property (#1298)

azhurkevich · web-flow · commit 3f99f1867051 · 2025-07-21T23:20:55.000-07:00
## 📌 Description  ## 🔍 Related Issues  ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). ## Reviewer Notes
diff --git a/csrc/trtllm_fused_moe_kernel_launcher.cu b/csrc/trtllm_fused_moe_kernel_launcher.cu
@@ -56,11 +56,17 @@ at::Tensor trtllm_fp8_per_tensor_scale_moe_launcher(
     bool const use_routing_scales_on_input, int64_t const tile_tokens_dim,
     int64_t const routing_method_type) {
   auto device = hidden_states.device();
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, device.index());
-  TORCH_CHECK(prop.major == 10 && prop.minor == 0,
-              "This kernel requires SM 100 architecture. Current device has SM ", prop.major,
-              prop.minor, " (", prop.name, ")");
+
+  static const std::tuple<int, int> device_props = [&device] {
+    int major, minor;
+    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device.index());
+    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device.index());
+    return std::make_tuple(major, minor);
+  }();
+
+  TORCH_CHECK(std::get<0>(device_props) == 10 && std::get<1>(device_props) == 0,
+              "This kernel requires SM 100 architecture. Current device has SM ",
+              std::get<0>(device_props), std::get<1>(device_props));
 
   if (use_routing_scales_on_input) {
     TORCH_CHECK(routing_logits.scalar_type() == at::ScalarType::BFloat16,
@@ -313,11 +319,17 @@ at::Tensor trtllm_fp8_block_scale_moe_launcher(
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::Runner& moe_runner,
     int64_t moeConfigIndex) {
   auto device = hidden_states.device();
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, device.index());
-  TORCH_CHECK(prop.major == 10 && prop.minor == 0,
-              "This kernel requires SM 100 architecture. Current device has SM ", prop.major,
-              prop.minor, " (", prop.name, ")");
+
+  static const std::tuple<int, int> device_props = [&device] {
+    int major, minor;
+    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device.index());
+    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device.index());
+    return std::make_tuple(major, minor);
+  }();
+
+  TORCH_CHECK(std::get<0>(device_props) == 10 && std::get<1>(device_props) == 0,
+              "This kernel requires SM 100 architecture. Current device has SM ",
+              std::get<0>(device_props), std::get<1>(device_props));
 
   TORCH_CHECK(routing_logits.scalar_type() == at::ScalarType::Float,
               "routing_logits must be float.");
@@ -593,11 +605,17 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe_launcher(
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::Runner& moe_runner,
     int64_t const moeConfigIndex) {
   auto device = hidden_states.device();
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, device.index());
-  TORCH_CHECK(prop.major == 10 && prop.minor == 0,
-              "This kernel requires SM 100 architecture. Current device has SM ", prop.major,
-              prop.minor, " (", prop.name, ")");
+
+  static const std::tuple<int, int> device_props = [&device] {
+    int major, minor;
+    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device.index());
+    cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device.index());
+    return std::make_tuple(major, minor);
+  }();
+
+  TORCH_CHECK(std::get<0>(device_props) == 10 && std::get<1>(device_props) == 0,
+              "This kernel requires SM 100 architecture. Current device has SM ",
+              std::get<0>(device_props), std::get<1>(device_props));
 
   TORCH_CHECK(tile_tokens_dim == 8 || tile_tokens_dim == 16 || tile_tokens_dim == 32 ||
                   tile_tokens_dim == 64,