Commit 210347c

cthi authored and facebook-github-bot committed
Move cudaGetDeviceProperties code to util (#4838)
Summary:
Pull Request resolved: #4838
X-link: facebookresearch/FBGEMM#1864

We should move this into its own function and reuse it instead of copy-pasting it. Also return the actual arch from `prop.major` directly.

Reviewed By: q10

Differential Revision: D81963785

fbshipit-source-id: d65cadeb65300c2ddfcc45f40afa9053f6763308
Parent: 2033a0a

File tree

6 files changed: +42, -66 lines


fbgemm_gpu/experimental/gen_ai/src/quantize/common/include/fbgemm_gpu/quantize/utils.h

Lines changed: 2 additions & 0 deletions
@@ -19,4 +19,6 @@ constexpr int64_t nextPowerOf2(int64_t num) {
   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
 }

+int getDeviceArch();
+
 } // namespace fbgemm_gpu
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "fbgemm_gpu/quantize/utils.h" // @manual
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAException.h>
+#include <cuda_runtime.h>
+
+namespace fbgemm_gpu {
+
+int getDeviceArch() {
+  static int arch = []() {
+    // Avoid expensive cudaGetDeviceProperties call.
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+
+    if (prop.major >= 10) {
+      int runtimeVersion = 0;
+      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
+      TORCH_CHECK(
+          runtimeVersion >= 12080, "SM100a+ kernels require cuda >= 12.8");
+    }
+
+    return prop.major;
+  }();
+  return arch;
+}
+} // namespace fbgemm_gpu
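For readers outside the FBGEMM tree, here is a minimal standalone sketch of the same once-per-process caching pattern, assuming only the CUDA runtime. `getDeviceArchSketch` and the plain-stderr version warning are illustrative stand-ins for the `C10_CUDA_CHECK`/`TORCH_CHECK` guards above, not FBGEMM APIs; it compiles with `nvcc` and returns `prop.major` directly (e.g. 9 on SM90/Hopper, 10 on SM100/Blackwell).

#include <cstdio>
#include <cuda_runtime.h>

// Query the device's major compute capability once and cache it for the
// lifetime of the process; the immediately-invoked lambda runs a single time,
// so the relatively expensive cudaGetDeviceProperties call is paid only once.
int getDeviceArchSketch() {
  static const int arch = []() {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, /*device=*/0);
    if (prop.major >= 10) {
      // Stand-in for the C10_CUDA_CHECK/TORCH_CHECK guard in the real helper.
      int runtimeVersion = 0;
      cudaRuntimeGetVersion(&runtimeVersion);
      if (runtimeVersion < 12080) {
        std::fprintf(stderr, "SM100a+ kernels require cuda >= 12.8\n");
      }
    }
    return prop.major; // e.g. 9 on Hopper (SM90), 10 on Blackwell (SM100)
  }();
  return arch;
}

int main() {
  std::printf("device arch (major compute capability): %d\n",
              getDeviceArchSketch());
  return 0;
}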

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/bf16bf16bf16_grouped.cu

Lines changed: 1 addition & 16 deletions
@@ -207,22 +207,7 @@ at::Tensor dispatch_bf16_grouped_kernel(
     at::Tensor output,
     std::optional<at::Tensor> zero_start_index_M = std::nullopt,
     std::optional<at::Tensor> M_sizes = std::nullopt) {
-  static int arch = -1;
-  // Avoid expensive cudaGetDeviceProperties call.
-  if (arch < 0) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, 0);
-    if (prop.major >= 10) {
-      arch = 10;
-      int runtimeVersion;
-      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
-      TORCH_CHECK(
-          runtimeVersion >= 12080,
-          "FP8 grouped GEMM on sm100a or above requires cuda >= 12.8");
-    } else {
-      arch = 9;
-    }
-  }
+  const int arch = getDeviceArch();

   // Select kernel to run via heuristics or tuning.
   auto kernel = [&]() {

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_groupwise.cu

Lines changed: 1 addition & 17 deletions
@@ -9,7 +9,6 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-// clang-format on

 #include "f8f8bf16_groupwise/f8f8bf16_groupwise_manifest.cuh"
 #include "fbgemm_gpu/quantize/tuning_cache.hpp"
@@ -64,22 +63,7 @@ at::Tensor dispatch_fp8_groupwise_kernel(
   int N = size_to_dim_(WQ.dim() - 1, WQ.sizes());
   int K = XQ.size(-1);

-  static int arch = -1;
-  // Avoid expensive cudaGetDeviceProperties call.
-  if (arch < 0) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, 0);
-    if (prop.major >= 10) {
-      arch = 10;
-      int runtimeVersion;
-      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
-      TORCH_CHECK(
-          runtimeVersion >= 12080,
-          "FP8 GEMM on sm100a or above requires cuda >= 12.8");
-    } else {
-      arch = 9;
-    }
-  }
+  const int arch = getDeviceArch();

   // Select kernel to run via heuristics or tuning.
   auto kernel = [&]() {

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_batched.cu

Lines changed: 3 additions & 16 deletions
@@ -10,6 +10,8 @@
 #include <cute/tensor.hpp>
 #include "f8f8bf16_rowwise_batched/f8f8bf16_rowwise_batched_manifest.cuh"

+#include "fbgemm_gpu/quantize/utils.h"
+
 namespace fbgemm_gpu {

 #if CUDART_VERSION >= 12000
@@ -30,22 +32,7 @@ at::Tensor dispatch_fp8_rowwise_batched_kernel(
     bool use_fast_accum = true,
     std::optional<at::Tensor> bias = std::nullopt,
     std::optional<at::Tensor> output = std::nullopt) {
-  static int arch = -1;
-  // Avoid expensive cudaGetDeviceProperties call.
-  if (arch < 0) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, 0);
-    if (prop.major >= 10) {
-      arch = 10;
-      int runtimeVersion;
-      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
-      TORCH_CHECK(
-          runtimeVersion >= 12080,
-          "FP8 batched GEMM on sm100a or above requires cuda >= 12.8");
-    } else {
-      arch = 9;
-    }
-  }
+  const int arch = getDeviceArch();

   TORCH_CHECK(
       (XQ.dim() == 3 && WQ.dim() == 3),

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise_grouped.cu

Lines changed: 1 addition & 17 deletions
@@ -9,7 +9,6 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-// clang-format on

 #include "f8f8bf16_rowwise_grouped/f8f8bf16_rowwise_grouped_manifest.cuh"
 #include "f8f8bf16_rowwise_grouped_sm100/f8f8bf16_rowwise_grouped_manifest.cuh"
@@ -32,22 +31,7 @@ TuningCache& getTuningCache() {
 template <typename InputType>
 Kernel_f8f8bf16_rowwise_grouped<InputType>
 get_kernel_via_heuristics(int total_M, int max_N, int max_K, int G) {
-  static int arch = -1;
-  // Avoid expensive cudaGetDeviceProperties call.
-  if (arch < 0) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, 0);
-    if (prop.major >= 10) {
-      arch = 10;
-      int runtimeVersion;
-      C10_CUDA_CHECK(cudaRuntimeGetVersion(&runtimeVersion));
-      TORCH_CHECK(
-          runtimeVersion >= 12080,
-          "FP8 grouped GEMM on sm100a or above requires cuda >= 12.8");
-    } else {
-      arch = 9;
-    }
-  }
+  const int arch = getDeviceArch();

   // Use heuristics to pick the best kernel implementation.
   if (arch == 10) {
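All of the call sites above now reduce to the same dispatch shape. A hedged sketch of that shape as plain, GPU-free C++: `run_sm90_kernel` and `run_sm100_kernel` are hypothetical placeholder names (not FBGEMM symbols), and the stubbed `getDeviceArch` stands in for the real helper so the example runs anywhere.

#include <cstdio>

// Stub standing in for fbgemm_gpu::getDeviceArch(); hardcoded so the
// example runs without a GPU.
int getDeviceArch() { return 9; }

void run_sm90_kernel() { std::puts("Hopper (SM90) kernel path"); }      // hypothetical
void run_sm100_kernel() { std::puts("Blackwell (SM100) kernel path"); } // hypothetical

int main() {
  const int arch = getDeviceArch();
  // Mirrors the heuristic branch in the dispatchers above.
  if (arch == 10) {
    run_sm100_kernel();
  } else {
    run_sm90_kernel();
  }
  return 0;
}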
