Commit 3b0f393

Revert "Adds snapshot API for MemPools to get pool memory segments (pytorch#133601)"
This reverts commit 00504aa. Reverted pytorch#133601 on behalf of https://github.com/wdvr: reverting for now, as it breaks a number of internal tests. Details below ([comment](pytorch#133601 (comment)))
1 parent 5916def · commit 3b0f393

3 files changed: +24 -85 lines changed

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 7 additions & 32 deletions
@@ -1897,41 +1897,16 @@ class DeviceCachingAllocator {
 
     std::unordered_map<PrivatePool*, MempoolId_t> pool_to_id;
     pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size());
-    std::vector<Block*> all_blocks;
-    MempoolId_t mempool_id = {0, 0};
-
-    auto active_mempool = MemPoolContext::getActiveMemPool();
-    if (active_mempool) {
-      mempool_id = active_mempool->id();
+    for (const auto& pair : graph_pools) {
+      pool_to_id[pair.second.get()] = pair.first;
     }
-
-    if (mempool_id.first != 0 || mempool_id.second != 0) {
-      // If there is an active mempool, we find the corresponding PrivatePool
-      // in graph_pools and only return the blocks from it.
-      auto pool = graph_pools.find(mempool_id);
-      if (pool != graph_pools.end()) {
-        pool_to_id[pool->second.get()] = pool->first;
-        all_blocks = get_private_pool_head_blocks(pool->second.get());
-      }
-      auto pool_freeable = graph_pools_freeable.find(mempool_id);
-      if (pool_freeable != graph_pools_freeable.end()) {
-        pool_to_id[pool_freeable->second] = pool_freeable->first;
-      }
-    } else {
-      // When snapshot is called outside a MemPoolContext, we return
-      // all the blocks in the CUDACachingAllocator (as returned by
-      // get_all_blocks).
-      for (const auto& pair : graph_pools) {
-        pool_to_id[pair.second.get()] = pair.first;
-      }
-      for (const auto& pair : graph_pools_freeable) {
-        pool_to_id[pair.second] = pair.first;
-      }
-      all_blocks = get_all_blocks();
+    for (const auto& pair : graph_pools_freeable) {
+      pool_to_id[pair.second] = pair.first;
     }
 
     size_t total_active = 0;
     std::vector<SegmentInfo> result;
+    const auto all_blocks = get_all_blocks();
 
     for (const Block* const head_block : all_blocks) {
       // For expandable segments, we report one segment for each contiguous
@@ -2134,8 +2109,8 @@ class DeviceCachingAllocator {
  private:
   // All private methods do not acquire the allocator mutex.
 
-  std::vector<Block*> get_all_blocks() const {
-    std::vector<Block*> blocks;
+  std::vector<const Block*> get_all_blocks() const {
+    std::vector<const Block*> blocks;
     blocks.insert(
         blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end());
     blocks.insert(
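
The removed branch made DeviceCachingAllocator::snapshot() pool-aware: with an active MemPool it collected only that pool's head blocks via get_private_pool_head_blocks(), and fell back to get_all_blocks() only when no pool was active; the restored code always walks get_all_blocks(). A minimal Python sketch of the user-visible difference, mirroring the removed MemPool.snapshot() wrapper later in this commit. The pool-filtered result only applies to a build that still contains pytorch#133601; the sketch also assumes a CUDA device and the default MemPool() constructor.

import torch

pool = torch.cuda.MemPool()  # private pool tracked by the CUDACachingAllocator

with torch.cuda.use_mem_pool(pool):
    buf = torch.randn(1024, device="cuda")  # allocation routed into `pool`

# Mirror of the removed MemPool.snapshot(): activate the pool, then snapshot.
ctx = torch.cuda.MemPoolContext(pool)
try:
    # Pre-revert: only segments belonging to `pool` are returned here.
    pool_segments = torch.cuda.memory_snapshot()
finally:
    del ctx

# With no active pool, every segment in the allocator is returned.
all_segments = torch.cuda.memory_snapshot()
print(len(pool_segments), len(all_segments))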

test/test_cuda.py

Lines changed: 1 addition & 19 deletions
@@ -4542,10 +4542,9 @@ def test_mempool_with_allocator(self):
         alloc_lib = ctypes.CDLL(dummy_allocator)
         called_dummy_alloc = ctypes.c_int.in_dll(alloc_lib, "called_dummy_alloc")
         self.assertEqual(called_dummy_alloc.value, 0)
-        nelem_1mb = 1024 * 1024 // 4
 
         with torch.cuda.use_mem_pool(pool):
-            out_0 = torch.randn(nelem_1mb, device="cuda")
+            out_0 = torch.randn(1, device="cuda")
 
             # pool's use count should be 2 at this point as use_mem_pool
             # holds a reference
@@ -4559,23 +4558,6 @@ def test_mempool_with_allocator(self):
         # out tensor
         self.assertEqual(called_dummy_alloc.value, 123)
 
-        with torch.cuda.use_mem_pool(pool):
-            # pool should have 1 segment since we made a small allocation (1 MB)
-            # above and so the CUDACachingAllocator packed it into a 2 MB buffer
-            self.assertEqual(len(pool.snapshot()), 1)
-
-            out_1 = torch.randn(nelem_1mb, device="cuda")
-
-            # pool should still have 1 segment since we made another small allocation
-            # (1 MB) that got packed into the existing 2 MB buffer
-            self.assertEqual(len(pool.snapshot()), 1)
-
-            out_2 = torch.randn(nelem_1mb, device="cuda")
-
-            # pool now should have 2 segments since the CUDACachingAllocator had
-            # to make a new 2 MB buffer to accomodate out_2
-            self.assertEqual(len(pool.snapshot()), 2)
-
     def test_mempool_context(self):
         active_pool = torch.cuda.MemPoolContext.active_pool()
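
The deleted assertions counted allocator segments, not tensors: each nelem_1mb tensor holds 1024 * 1024 // 4 float32 elements (about 1 MB), and the removed comments note that the CUDACachingAllocator packs such small allocations into 2 MB buffers, so two tensors share one segment and a third forces a second. A condensed sketch of that reasoning; it is only meaningful on a pre-revert build where pool.snapshot() exists, so the snapshot calls are left as comments.

import torch

nelem_1mb = 1024 * 1024 // 4  # number of float32 elements in ~1 MB
pool = torch.cuda.MemPool()

with torch.cuda.use_mem_pool(pool):
    out_0 = torch.randn(nelem_1mb, device="cuda")  # ~1 MB, packed into a 2 MB segment
    out_1 = torch.randn(nelem_1mb, device="cuda")  # fits into the same 2 MB segment
    # Pre-revert: len(pool.snapshot()) == 1 here (one segment, two tensors).
    out_2 = torch.randn(nelem_1mb, device="cuda")  # forces a second 2 MB segment
    # Pre-revert: len(pool.snapshot()) == 2 here; segments are counted, not tensors.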

torch/cuda/memory.py

Lines changed: 16 additions & 34 deletions
@@ -980,25 +980,6 @@ def _get_current_allocator() -> _CUDAAllocator:
     return _CUDAAllocator(torch._C._cuda_getAllocator())
 
 
-class MemPoolContext(_MemPoolContext):
-    r"""MemPoolContext holds the currently active pool and stashes the previous
-    pool. On deletion it makes the previous pool active.
-
-    Args:
-        pool(torch.cuda.MemPool): a MemPool object to be made active so that
-            allocations route to this pool.
-
-    """
-
-    def __init__(self, pool: _MemPool):
-        super().__init__(pool)
-
-    @staticmethod
-    def active_pool() -> Optional[_MemPool]:
-        r"""Returns the active MemPool"""
-        return _MemPoolContext.active_pool()
-
-
 class MemPool(_MemPool):
     r"""MemPool represents a pool of memory in a caching allocator. Currently,
     it's just the ID of the pool object maintained in the CUDACachingAllocator.
@@ -1029,23 +1010,24 @@ def use_count(self) -> int:
         r"""Returns the reference count of this pool."""
         return super().use_count()
 
-    def snapshot(self):
-        r"""Return a snapshot of the CUDA memory allocator pool state across all
-        devices.
 
-        Interpreting the output of this function requires familiarity with the
-        memory allocator internals.
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
 
-        .. note::
-            See :ref:`cuda-memory-management` for more details about GPU memory
-            management.
-        """
-        try:
-            ctx = MemPoolContext(self)
-            snapshot = torch.cuda.memory_snapshot()
-        finally:
-            del ctx
-        return snapshot
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
 
 
 @contextlib.contextmanager
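
After the revert, torch/cuda/memory.py keeps MemPool and MemPoolContext (now defined after MemPool and typed against it) but no longer exposes MemPool.snapshot(). A short usage sketch of the APIs that remain; the reference-count values follow the comments in test_mempool_with_allocator above and should be read as expectations under these assumptions, not guarantees.

import torch

# Assumes no other pool has been made active in this process.
print(torch.cuda.MemPoolContext.active_pool())  # expected: None

pool = torch.cuda.MemPool()
print(pool.use_count())  # 1: only our handle references the pool

with torch.cuda.use_mem_pool(pool):
    # use_mem_pool holds an extra reference while the pool is active
    print(pool.use_count())  # expected: 2
    print(torch.cuda.MemPoolContext.active_pool() is not None)  # expected: True

print(pool.use_count())  # expected to drop back to 1 once use_mem_pool releases its reference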
