Discover the number of SMs using the CUDA runtime (#14)

Amir-19 · web-flow · commit d480977ecc01 · 2025-05-01T09:37:30.000-07:00
We can discover the number of SMs at run time through the CUDA runtime API (`cudaDeviceGetAttribute`) instead of hard-coding 132.
diff --git a/csrc/all_to_all/CMakeLists.txt b/csrc/all_to_all/CMakeLists.txt
@@ -4,6 +4,10 @@ add_library(all_to_all_common STATIC
     all_to_all.cpp
 )
 
+target_link_libraries(all_to_all_common PUBLIC
+    CUDA::cudart
+)
+
 add_library(all_to_all_intranode_lib STATIC
     intranode_combine.cu
     intranode_dispatch.cu
diff --git a/csrc/all_to_all/all_to_all.cpp b/csrc/all_to_all/all_to_all.cpp
@@ -1,5 +1,6 @@
 #include "all_to_all.h"
 
+#include "core/cuda_utils.h"
 #include "core/utils.h"
 
 using namespace pplx;
@@ -25,7 +26,8 @@ AllToAll::AllToAll(
       hiddenDimScaleBytes(hiddenDimScaleBytes),
       rank(rank),
       worldSize(worldSize),
-      dpSize(dpSize) {
+      dpSize(dpSize),
+      numSMs(get_sm_count()) {
 
   ROSE_ASSERT(hiddenDimBytes % 16 == 0, "invalid hidden dim bytes");
   ROSE_ASSERT(hiddenDimScaleBytes % 16 == 0, "invalid hidden dim scale bytes");
diff --git a/csrc/all_to_all/all_to_all.h b/csrc/all_to_all/all_to_all.h
@@ -63,6 +63,8 @@ class AllToAll {
   const unsigned worldSize;
   /// The size of a DP group.
   const unsigned dpSize;
+  /// The number of streaming multiprocessors (SMs) on the device.
+  const int numSMs;
 };
 
 } // namespace pplx
diff --git a/csrc/all_to_all/internode_combine.cu b/csrc/all_to_all/internode_combine.cu
@@ -162,7 +162,7 @@ void AllToAllInterNode::combine(
   const size_t numLocalExperts = numExperts / worldSize;
   const size_t numDPGroups = worldSize / dpSize;
   const size_t batchNumTokens = numLocalExperts * numDPGroups * maxNumTokens;
-  const size_t numBlocks = std::min(132ul, batchNumTokens);
+  const size_t numBlocks = std::min(static_cast<size_t>(numSMs), batchNumTokens);
 
   assert(hiddenDimBytes % 16 == 0);
 
diff --git a/csrc/all_to_all/internode_dispatch.cu b/csrc/all_to_all/internode_dispatch.cu
@@ -270,7 +270,7 @@ void AllToAllInterNode::dispatch(
       std::max(
           ceil_div<unsigned>(numExperts, NUM_WARPS), (unsigned)(maxNumTokens * expertsPerToken)
       ),
-      132u
+      static_cast<unsigned>(numSMs)
   );
   dim3 dimGrid(numBlocks, 1, 1);
   dim3 dimBlock(NUM_WARPS * 32, 1, 1);
diff --git a/csrc/all_to_all/intranode_combine.cu b/csrc/all_to_all/intranode_combine.cu
@@ -178,7 +178,7 @@ void AllToAllIntraNode::combine(
   const size_t numLocalExperts = numExperts / worldSize;
   const size_t numDPGroups = worldSize / dpSize;
   const size_t batchNumTokens = numLocalExperts * numDPGroups * maxNumTokens;
-  const size_t numBlocks = std::min(132ul, batchNumTokens);
+  const size_t numBlocks = std::min(static_cast<size_t>(numSMs), batchNumTokens);
 
   assert(hiddenDimBytes % 16 == 0);
 
diff --git a/csrc/all_to_all/intranode_dispatch.cu b/csrc/all_to_all/intranode_dispatch.cu
@@ -278,7 +278,7 @@ void AllToAllIntraNode::dispatch(
       std::max(
           ceil_div<unsigned>(numExperts, NUM_WARPS), (unsigned)(maxNumTokens * expertsPerToken)
       ),
-      132u
+      static_cast<unsigned>(numSMs)
   );
   dim3 dimGrid(numBlocks, 1, 1);
   dim3 dimBlock(NUM_WARPS * 32, 1, 1);
diff --git a/csrc/core/cuda_utils.h b/csrc/core/cuda_utils.h
@@ -20,4 +20,19 @@ template <typename T> T *mallocZeroBuffer(size_t size) {
   cudaMemset(ptr, 0, size * sizeof(T));
   return ptr;
 }
+
+/// Returns the number of streaming multiprocessors (SMs) on the CUDA device
+/// reported by cudaGetDevice() at the time of the call.
+///
+/// Both out-parameters are zero-initialised so that, should CUDACHECK ever
+/// return instead of aborting, the result is 0 rather than indeterminate.
+inline int get_sm_count() {
+  int device = 0;
+  CUDACHECK(cudaGetDevice(&device));
+  int numSMs = 0;
+  CUDACHECK(cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device));
+
+  return numSMs;
+}
+
 } // namespace pplx