Skip to content

Commit 22d46b5

Browse files
[CUDA] revert PR 130472 (pytorch#163379)
[CUDA] revert PR 130472 (pytorch#162950) This change may also resolve pytorch#161789, though verification is still needed. PR pytorch#130472 introduced a problem of freeing the same address without cleaning up its metadata. Per the discussion below, that PR has been reverted. Pull Request resolved: pytorch#162950 Approved by: https://github.com/ngimel, https://github.com/eqy, https://github.com/syed-ahmed (cherry picked from commit 4a160da) Co-authored-by: thenumberouscode <[email protected]>
1 parent d1b63e2 commit 22d46b5

File tree

4 files changed

+17
-105
lines changed

4 files changed

+17
-105
lines changed

aten/src/ATen/test/cuda_allocator_test.cpp

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,51 +5,6 @@
55

66
#include <ATen/test/allocator_clone_test.h>
77

8-
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
9-
108
TEST(AllocatorTestCUDA, test_clone) {
119
test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
1210
}
13-
14-
static int called_dummy_free_0 = 0;
15-
static int called_dummy_free_1 = 0;
16-
17-
void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
18-
void dummy_free_0(void* data, size_t size, int device, void* stream) {
19-
called_dummy_free_0++;
20-
}
21-
void dummy_free_1(void* data, size_t size, int device, void* stream) {
22-
called_dummy_free_1++;
23-
}
24-
25-
// Tests that data_ptrs have their respective deleters
26-
// when mixing allocators
27-
TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
28-
// Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
29-
auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
30-
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
31-
at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
32-
33-
// Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
34-
auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
35-
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
36-
at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
37-
38-
// Manually use a's deleter
39-
auto* ctx = a.storage().data_ptr().get_context();
40-
a.storage().data_ptr().get_deleter()(ctx);
41-
a.storage().mutable_data_ptr().release_context();
42-
43-
// a's deleter is dummy_free_0
44-
// dummy_free_0 should be called above, so called_dummy_free_0 should be 1
45-
ASSERT_TRUE(called_dummy_free_0 == 1);
46-
47-
// Manually use b's deleter
48-
ctx = b.storage().data_ptr().get_context();
49-
b.storage().data_ptr().get_deleter()(ctx);
50-
b.storage().mutable_data_ptr().release_context();
51-
52-
// b's deleter is dummy_free_1
53-
// dummy_free_1 should be called above, so called_dummy_free_1 should be 1
54-
ASSERT_TRUE(called_dummy_free_1 == 1);
55-
}

torch/csrc/cuda/CUDAPluggableAllocator.cpp

Lines changed: 7 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,6 @@
77

88
namespace torch::cuda::CUDAPluggableAllocator {
99

10-
CUDAPluggableAllocatorDeleterContext::CUDAPluggableAllocatorDeleterContext(
11-
std::function<FreeFuncType> free_fn,
12-
void* data,
13-
size_t size,
14-
int device,
15-
cudaStream_t stream)
16-
: free_fn_(std::move(free_fn)),
17-
data_(data),
18-
size_(size),
19-
device_(device),
20-
stream_(stream) {}
21-
22-
void CUDAPluggableAllocatorDeleterContext::free() {
23-
free_fn_(data_, size_, device_, stream_);
24-
delete this;
25-
}
26-
2710
int device_count = 0;
2811

2912
void custom_raw_deleter(void* ptr);
@@ -41,8 +24,8 @@ _AllocationMetadata::_AllocationMetadata(
4124
// This avoids having to link against libtorch for C++ based custom allocators
4225
// And also use this from python
4326
CUDAPluggableAllocator::CUDAPluggableAllocator(
44-
std::function<MallocFuncType> alloc_fn,
45-
std::function<FreeFuncType> free_fn)
27+
std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
28+
std::function<void(void*, size_t, int, cudaStream_t)> free_fn)
4629
: alloc_fn_(std::move(alloc_fn)), free_fn_(std::move(free_fn)) {}
4730

4831
CUDAPluggableAllocator::CUDAPluggableAllocator(CUDAPluggableAllocator& other)
@@ -114,10 +97,8 @@ c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) {
11497
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
11598
cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device);
11699
void* r = this->malloc(size, device, stream);
117-
auto* ctx = new CUDAPluggableAllocatorDeleterContext(
118-
free_fn_, r, size, device, stream);
119100
c10::DataPtr data_ptr = {
120-
r, ctx, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
101+
r, r, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
121102
return data_ptr;
122103
}
123104

@@ -382,8 +363,8 @@ getCurrentAllocator() {
382363
// TODO: add more functions in the argument
383364
std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
384365
createCustomAllocator(
385-
std::function<MallocFuncType> alloc_fn,
386-
std::function<FreeFuncType> free_fn) {
366+
std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
367+
std::function<void(void*, size_t, int, cudaStream_t)> free_fn) {
387368
std::shared_ptr<CUDAPluggableAllocator> allocator(
388369
new CUDAPluggableAllocator(std::move(alloc_fn), std::move(free_fn)));
389370
allocator->init(device_count);
@@ -400,8 +381,8 @@ void changeCurrentAllocator(
400381
current_custom_allocator = allocator;
401382
}
402383

403-
void custom_raw_deleter(void* ctx) {
404-
reinterpret_cast<CUDAPluggableAllocatorDeleterContext*>(ctx)->free();
384+
void custom_raw_deleter(void* ptr) {
385+
current_custom_allocator->raw_delete(ptr);
405386
}
406387

407388
} // namespace torch::cuda::CUDAPluggableAllocator

torch/csrc/cuda/CUDAPluggableAllocator.h

Lines changed: 6 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,32 +11,6 @@
1111

1212
namespace torch::cuda::CUDAPluggableAllocator {
1313

14-
using MallocFuncType = void*(size_t, int, cudaStream_t);
15-
using FreeFuncType = void(void*, size_t, int, cudaStream_t);
16-
17-
// A CUDAPluggableAllocatorDeleterContext object is used as the `ctx`
18-
// argument for DataPtr. We need context because a user can use
19-
// multiple allocators in the same PyTorch program, and
20-
// the allocators can have different free functions, such as:
21-
// free, cudaFree, cudaFreeAsync, ncclMemFree etc.
22-
struct TORCH_CUDA_CPP_API CUDAPluggableAllocatorDeleterContext {
23-
explicit CUDAPluggableAllocatorDeleterContext(
24-
std::function<FreeFuncType> free_fn,
25-
void* data,
26-
size_t size,
27-
int device,
28-
cudaStream_t stream);
29-
30-
void free();
31-
32-
private:
33-
std::function<FreeFuncType> free_fn_;
34-
void* data_;
35-
size_t size_;
36-
int device_;
37-
cudaStream_t stream_{};
38-
};
39-
4014
#if defined(USE_ROCM)
4115
using streamType = c10::hip::HIPStream;
4216
#else
@@ -49,8 +23,8 @@ getCurrentAllocator();
4923
TORCH_CUDA_CPP_API std::shared_ptr<
5024
c10::cuda::CUDACachingAllocator::CUDAAllocator>
5125
createCustomAllocator(
52-
std::function<MallocFuncType> alloc_fn,
53-
std::function<FreeFuncType> free_fn);
26+
std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
27+
std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
5428
TORCH_CUDA_CPP_API void changeCurrentAllocator(
5529
const std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>&
5630
allocator);
@@ -69,8 +43,8 @@ struct _AllocationMetadata {
6943
struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
7044
: public c10::cuda::CUDACachingAllocator::CUDAAllocator {
7145
CUDAPluggableAllocator(
72-
std::function<MallocFuncType> alloc_fn,
73-
std::function<FreeFuncType> free_fn);
46+
std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
47+
std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
7448

7549
CUDAPluggableAllocator(CUDAPluggableAllocator& other);
7650
CUDAPluggableAllocator(CUDAPluggableAllocator&& other) = delete;
@@ -173,8 +147,8 @@ struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
173147
void copy_data(void* dest, const void* src, std::size_t count) const final;
174148

175149
protected:
176-
std::function<MallocFuncType> alloc_fn_;
177-
std::function<FreeFuncType> free_fn_;
150+
std::function<void*(size_t, int, cudaStream_t)> alloc_fn_;
151+
std::function<void(void*, size_t, int, cudaStream_t)> free_fn_;
178152
std::function<void(int)> init_fn_;
179153
std::function<void()> reset_fn_;
180154
std::function<void(double, int)> memory_fraction_fn_;

torch/csrc/cuda/Module.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,14 +1274,16 @@ static void registerCudaPluggableAllocator(PyObject* module) {
12741274
self.set_release_pool(func);
12751275
});
12761276
m.def("_cuda_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
1277-
using namespace torch::cuda::CUDAPluggableAllocator;
1277+
using MallocFuncType = void*(size_t, int, cudaStream_t);
1278+
using FreeFuncType = void(void*, size_t, int, cudaStream_t);
12781279
std::function<MallocFuncType> malloc_fn =
12791280
// NOLINTNEXTLINE(performance-no-int-to-ptr)
12801281
reinterpret_cast<MallocFuncType*>(malloc_ptr);
12811282
std::function<FreeFuncType> free_fn =
12821283
// NOLINTNEXTLINE(performance-no-int-to-ptr)
12831284
reinterpret_cast<FreeFuncType*>(free_ptr);
1284-
return createCustomAllocator(malloc_fn, free_fn);
1285+
return torch::cuda::CUDAPluggableAllocator::createCustomAllocator(
1286+
malloc_fn, free_fn);
12851287
});
12861288

12871289
// NOLINTNEXTLINE(bugprone-unused-raii)

0 commit comments

Comments (0)