Skip to content

Commit f2b69a0

Browse files
pytorchbot, eee4017, nWEIdia
authored
[CUDA] Use runtime driver API for cuStreamWriteValue32 (pytorch#158585)
[CUDA] Use runtime driver API for cuStreamWriteValue32 (pytorch#158295) Reopen pytorch#156097 Fixes pytorch#154073 Reference: NVIDIA/Fuser#4197 See PR pytorch#156097 and pytorch#154097 Pull Request resolved: pytorch#158295 Approved by: https://github.com/Skylion007, https://github.com/ngimel, https://github.com/eqy, https://github.com/huydhn (cherry picked from commit a9f902a) Co-authored-by: Frank Lin <[email protected]> Co-authored-by: Wei Wang <[email protected]>
1 parent 8dd8510 commit f2b69a0

File tree

3 files changed

+81
-38
lines changed

3 files changed

+81
-38
lines changed

c10/cuda/driver_api.cpp

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,35 @@
11
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
2+
#include <c10/cuda/CUDAException.h>
23
#include <c10/cuda/driver_api.h>
34
#include <c10/util/CallOnce.h>
45
#include <c10/util/Exception.h>
6+
#include <c10/util/Logging.h>
7+
#include <cuda_runtime.h>
58
#include <dlfcn.h>
69

710
namespace c10::cuda {
811

912
namespace {
1013

14+
void* get_symbol(const char* name, int version);
15+
1116
DriverAPI create_driver_api() {
12-
void* handle_0 = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD);
13-
TORCH_CHECK(handle_0, "Can't open libcuda.so.1: ", dlerror());
1417
void* handle_1 = DriverAPI::get_nvml_handle();
1518
DriverAPI r{};
1619

17-
#define LOOKUP_LIBCUDA_ENTRY(name) \
18-
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
19-
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name, ": ", dlerror())
20-
C10_LIBCUDA_DRIVER_API(LOOKUP_LIBCUDA_ENTRY)
21-
#undef LOOKUP_LIBCUDA_ENTRY
20+
#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED(name, version) \
21+
r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version)); \
22+
TORCH_INTERNAL_ASSERT(r.name##_, "Can't find ", #name);
23+
C10_LIBCUDA_DRIVER_API_REQUIRED(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED)
24+
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_REQUIRED
2225

23-
#define LOOKUP_LIBCUDA_ENTRY(name) \
24-
r.name##_ = ((decltype(&name))dlsym(handle_0, #name)); \
25-
dlerror();
26-
C10_LIBCUDA_DRIVER_API_12030(LOOKUP_LIBCUDA_ENTRY)
27-
#undef LOOKUP_LIBCUDA_ENTRY
26+
// Users running drivers between 12.0 and 12.3 will not have these symbols,
27+
// they will be resolved to nullptr, but we guard their usage at runtime
28+
// to ensure safe fallback behavior.
29+
#define LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL(name, version) \
30+
r.name##_ = reinterpret_cast<decltype(&name)>(get_symbol(#name, version));
31+
C10_LIBCUDA_DRIVER_API_OPTIONAL(LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL)
32+
#undef LOOKUP_LIBCUDA_ENTRY_WITH_VERSION_OPTIONAL
2833

2934
if (handle_1) {
3035
#define LOOKUP_NVML_ENTRY(name) \
@@ -35,6 +40,32 @@ DriverAPI create_driver_api() {
3540
}
3641
return r;
3742
}
43+
44+
void* get_symbol(const char* name, int version) {
45+
void* out = nullptr;
46+
cudaDriverEntryPointQueryResult qres{};
47+
48+
// CUDA 12.5+ supports version-based lookup
49+
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12050)
50+
if (auto st = cudaGetDriverEntryPointByVersion(
51+
name, &out, version, cudaEnableDefault, &qres);
52+
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
53+
return out;
54+
}
55+
#endif
56+
57+
// This falls back to the old API to try resolving the symbol again.
58+
if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
59+
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
60+
return out;
61+
}
62+
63+
// If the symbol cannot be resolved, report and return nullptr;
64+
// the caller is responsible for checking the pointer.
65+
LOG(INFO) << "Failed to resolve symbol " << name;
66+
return nullptr;
67+
}
68+
3869
} // namespace
3970

4071
void* DriverAPI::get_nvml_handle() {

c10/cuda/driver_api.h

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,29 +20,42 @@
2020
} \
2121
} while (0)
2222

23-
#define C10_LIBCUDA_DRIVER_API(_) \
24-
_(cuDeviceGetAttribute) \
25-
_(cuMemAddressReserve) \
26-
_(cuMemRelease) \
27-
_(cuMemMap) \
28-
_(cuMemAddressFree) \
29-
_(cuMemSetAccess) \
30-
_(cuMemUnmap) \
31-
_(cuMemCreate) \
32-
_(cuMemGetAllocationGranularity) \
33-
_(cuMemExportToShareableHandle) \
34-
_(cuMemImportFromShareableHandle) \
35-
_(cuMemsetD32Async) \
36-
_(cuStreamWriteValue32) \
37-
_(cuGetErrorString)
23+
// The integer in the second column specifies the requested CUDA Driver API
24+
// version. The dynamic loader will accept a driver with a newer version, and it
// ensures that the requested symbol was already available in the specified
// driver version.
27+
28+
// Keep these requested versions as low as possible to maximize compatibility
29+
// across different driver versions.
30+
31+
// Why do we pin to an older version instead of using the latest?
32+
// If a user installs a newer driver, blindly resolving the symbol may bind to a
33+
// newer version of the function with different behavior, potentially breaking
34+
// PyTorch.
35+
36+
#define C10_LIBCUDA_DRIVER_API_REQUIRED(_) \
37+
_(cuDeviceGetAttribute, 12000) \
38+
_(cuMemAddressReserve, 12000) \
39+
_(cuMemRelease, 12000) \
40+
_(cuMemMap, 12000) \
41+
_(cuMemAddressFree, 12000) \
42+
_(cuMemSetAccess, 12000) \
43+
_(cuMemUnmap, 12000) \
44+
_(cuMemCreate, 12000) \
45+
_(cuMemGetAllocationGranularity, 12000) \
46+
_(cuMemExportToShareableHandle, 12000) \
47+
_(cuMemImportFromShareableHandle, 12000) \
48+
_(cuMemsetD32Async, 12000) \
49+
_(cuStreamWriteValue32, 12000) \
50+
_(cuGetErrorString, 12000)
3851

3952
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
40-
#define C10_LIBCUDA_DRIVER_API_12030(_) \
41-
_(cuMulticastAddDevice) \
42-
_(cuMulticastBindMem) \
43-
_(cuMulticastCreate)
53+
#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \
54+
_(cuMulticastAddDevice, 12030) \
55+
_(cuMulticastBindMem, 12030) \
56+
_(cuMulticastCreate, 12030)
4457
#else
45-
#define C10_LIBCUDA_DRIVER_API_12030(_)
58+
#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_)
4659
#endif
4760

4861
#define C10_NVML_DRIVER_API(_) \
@@ -56,11 +69,14 @@
5669
namespace c10::cuda {
5770

5871
struct DriverAPI {
72+
#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
5973
#define CREATE_MEMBER(name) decltype(&name) name##_;
60-
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
61-
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
74+
C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED)
75+
C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED)
6276
C10_NVML_DRIVER_API(CREATE_MEMBER)
77+
#undef CREATE_MEMBER_VERSIONED
6378
#undef CREATE_MEMBER
79+
6480
static DriverAPI* get();
6581
static void* get_nvml_handle();
6682
};

test/distributed/test_symmetric_memory.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1078,10 +1078,6 @@ class SymmMemSingleProcTest(TestCase):
10781078
not TEST_WITH_ROCM and _get_torch_cuda_version() < (12, 0),
10791079
"stream_write_value32 currently only supports cuda version>=12.0",
10801080
)
1081-
@skipIf(
1082-
_get_torch_cuda_version() >= (12, 6),
1083-
"https://github.com/pytorch/pytorch/issues/154073",
1084-
)
10851081
@runOnRocmArch(MI300_ARCH)
10861082
def test_stream_write_value32(self):
10871083
tensor = torch.zeros(4, dtype=torch.uint32, device="cuda")

0 commit comments

Comments
 (0)