【Allocator】Update strategy of TryAlloc and AllocatorVisitor (PaddlePaddle#76523)

liuruyan · web-flow · commit d24c763f4dd8 · 2025-11-24T14:10:37.000+08:00
* update_strategy

* update strategy
diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc
@@ -2318,7 +2318,7 @@ PHI_DEFINE_EXPORTED_bool(use_accuracy_compatible_kernel,
 /**
  * Allocator Compact related FLAG
  * Name: FLAGS_enable_compact_mem
- * Since Version: 3.2.2
+ * Since Version: 3.3
  * Value Range: bool, default=false
  * Example:
  * Note: whether start compact memory.
@@ -2329,7 +2329,7 @@ PHI_DEFINE_EXPORTED_bool(enable_compact_mem,
 /**
  * Allocator Compact related FLAG
  * Name: FLAGS_max_reserved_threshold_in_gb
- * Since Version: 3.2.2
+ * Since Version: 3.3
  * Value Range: int64, default=70
  * Example:
  * Note: Threshold (GB) used in compact memory. Only reserved_mem greater than
@@ -2344,7 +2344,7 @@ PHI_DEFINE_EXPORTED_int64(
 /**
  * Allocator Compact related FLAG
  * Name: FLAGS_cur_allocated_threshold_in_gb
- * Since Version: 3.2.2
+ * Since Version: 3.3
  * Value Range: int64, default=70
  * Example:
  * Note: Threshold (GB) used in compact memory. Only reserved_mem greater than
@@ -2359,7 +2359,7 @@ PHI_DEFINE_EXPORTED_int64(
 /**
  * Allocator Compact related FLAG
  * Name: FLAGS_try_allocate
- * Since Version: 3.2.2
+ * Since Version: 3.3
  * Value Range: bool, default=false
  * Example:
  * Note: whether start compact memory.
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
@@ -543,7 +543,7 @@ PyObject* eager_api_run_custom_op(PyObject* self,
                                   PyObject* kwargs) {
   EAGER_TRY
   FLAGS_tensor_operants_mode = "phi";
-  bool old_flag = FLAGS_enable_compact_mem;
+  bool compact_flag_bak = FLAGS_enable_compact_mem;
   FLAGS_enable_compact_mem = false;
   if (paddle::OperantsManager::Instance().phi_operants.get() == nullptr) {
     paddle::OperantsManager::Instance().phi_operants =
@@ -881,7 +881,7 @@ PyObject* eager_api_run_custom_op(PyObject* self,
   if (FLAGS_check_cuda_error) [[unlikely]] {
     egr::CUDAErrorCheck("eager_api_run_custom_op " + op_type + " finish");
   }
-  FLAGS_enable_compact_mem = old_flag;
+  FLAGS_enable_compact_mem = compact_flag_bak;
   return ToPyObject(*ctx.AllMutableOutput());
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc
@@ -870,32 +870,45 @@ void CheckAndDoCompact(const std::vector<phi::MetaTensor*>& meta_tensors,
   auto NeedCompact = [&](const std::vector<phi::MetaTensor*>& meta_tensors) {
     if (max_reserved < FLAGS_max_reserved_threshold_in_gb << 30) return false;
     if (cur_allocated < FLAGS_cur_allocated_threshold_in_gb << 30) return false;
-    const auto [max_free_size, total_free_size] =
+    const auto [max_free_size, large_N_free_size] =
         paddle::memory::VmmMaxFreeSize(phi::GPUPlace(current_device_id),
                                        meta_tensors.size());
     const auto& [req_total_size, size_vec] = CalTensorSize(meta_tensors);
+    // Verbose trace of every compaction decision input. The leading ", " keeps
+    // the api name from running into the first field label in the log line.
+    VLOG(10) << "run api: " << api << ", req_total_size: " << req_total_size
+             << ", max_free_size: " << max_free_size
+             << ", large_N_free_size: " << large_N_free_size
+             << ", max_reserved: " << max_reserved
+             << ", max_allocated: " << max_allocated
+             << ", cur_allocated: " << cur_allocated;
     if (req_total_size < max_free_size) return false;
-    if (req_total_size > total_free_size) {
+    if (req_total_size > large_N_free_size) {
       VLOG(1) << "Need Compact req_total_size: " << req_total_size
-              << ", total_free_size: " << total_free_size
-              << ", max_free_size: " << max_free_size;
+              << ", large_N_free_size: " << large_N_free_size
+              << ", max_free_size: " << max_free_size
+              << ", max_reserved: " << max_reserved
+              << ", max_allocated: " << max_allocated
+              << ", cur_allocated: " << cur_allocated;
       return true;
     }
     if (FLAGS_try_allocate) {
       auto alloc_succ = paddle::memory::TryAllocBatch(
           phi::GPUPlace(current_device_id), size_vec);
-      VLOG(1) << "TryAllocBatch ret: " << !alloc_succ
+      VLOG(1) << "TryAllocBatch ret: " << alloc_succ
               << ", req_total_size: " << req_total_size
-              << ", total_free_size: " << total_free_size
-              << ", max_free_size: " << max_free_size;
+              << ", large_N_free_size: " << large_N_free_size
+              << ", max_free_size: " << max_free_size
+              << ", max_reserved: " << max_reserved
+              << ", max_allocated: " << max_allocated
+              << ", cur_allocated: " << cur_allocated;
       return !alloc_succ;
     }
     return false;
   };
 
   if (NeedCompact(meta_tensors)) {
     VLOG(1) << "Before Compact max_reserved: " << max_reserved / divisor
-            << ", max_allocated: " << max_allocated / divisor;
+            << "GB, max_allocated: " << max_allocated / divisor
+            << "GB, cur_allocated: " << cur_allocated / divisor << "GB";
     paddle::memory::Compact(phi::GPUPlace(current_device_id));
   }
 #endif
diff --git a/paddle/phi/core/memory/allocation/allocator_facade.cc b/paddle/phi/core/memory/allocation/allocator_facade.cc
@@ -1013,11 +1013,29 @@ class AllocatorFacadePrivate {
       val = 0;
     }
 
-    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
+    if (val > 0 && FLAGS_use_virtual_memory_auto_growth &&
+        !FLAGS_use_multi_scale_virtual_memory_auto_growth) {
       auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
       cuda_allocators_[p][stream] =
           std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
               cuda_allocator, platform::GpuMinChunkSize(), p);
+    } else if (val > 0 && FLAGS_use_multi_scale_virtual_memory_auto_growth) {
+      // Multi-scale VMM: the small and the large pool each get their own
+      // CUDAVirtualMemAllocator-backed best-fit allocator, and both are wrapped
+      // by a single multi-scale pool allocator registered for this stream.
+      auto cuda_allocator_small = std::make_shared<CUDAVirtualMemAllocator>(p);
+      auto cuda_allocator_large = std::make_shared<CUDAVirtualMemAllocator>(p);
+      auto vmm_allocator_small =
+          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
+              cuda_allocator_small, platform::GpuMinChunkSize(), p);
+      auto vmm_allocator_large =
+          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
+              cuda_allocator_large, platform::GpuMinChunkSize(), p);
+
+      cuda_allocators_[p][stream] = std::make_shared<
+          VirtualMemoryAutoGrowthBestFitMultiScalePoolAllocator>(
+          vmm_allocator_small,
+          vmm_allocator_large,
+          platform::GpuMinChunkSize(),
+          p);
     } else {
       auto cuda_allocator = CreateCUDAAllocator(p);
       if (FLAGS_use_auto_growth_v2) {
diff --git a/paddle/phi/core/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/phi/core/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc
@@ -363,10 +363,11 @@ bool VirtualMemoryAutoGrowthBestFitAllocator::TryAllocateBatch(
 
   std::lock_guard<SpinLock> guard(spinlock_);
 
-  // copy free_blocks_ to shadow_blocks_
+  // copy large N free_blocks_ to shadow_blocks_.
   std::map<std::pair<size_t, void *>, size_t> shadow_blocks;
-  for (const auto &pair : free_blocks_) {
-    shadow_blocks.emplace(pair.first, pair.first.first);
+  auto it = free_blocks_.rbegin();
+  for (int i = 0; i < sizes.size() && it != free_blocks_.rend(); ++i, ++it) {
+    shadow_blocks.emplace(it->first, it->first.first);
   }
   for (size_t size : sizes) {
     size_t aligned_size = AlignedSize(size, alignment_);
diff --git a/paddle/phi/core/memory/mem_visitor.cc b/paddle/phi/core/memory/mem_visitor.cc
@@ -54,23 +54,33 @@ void AllocatorVisitor::Visit(
   allocator->GetLargeAllocator()->Accept(this);
 }
 
+// Visits only the first allocator returned by GetAllocatorByPlace(), which is
+// assumed to be the compute stream allocator (see the NOTE below), instead of
+// every per-stream allocator.
+void AllocatorComputeStreamVisitor::Visit(StreamSafeCUDAAllocator* allocator) {
+  const std::vector<StreamSafeCUDAAllocator*>& allocators =
+      allocator->GetAllocatorByPlace();
+  assert(!allocators.empty());
+  // assert() is compiled out under NDEBUG, so guard explicitly: indexing an
+  // empty vector would be undefined behaviour in release builds.
+  if (allocators.empty()) {
+    return;
+  }
+  // NOTE(liujinnan): Currently, the Allocator initialization sequence is as
+  // follows: the compute stream Allocator is initialized at program startup,
+  // and then, when multiple streams are encountered at runtime, additional
+  // Allocators are created and added to the end of the `allocator_map_` in
+  // `StreamSafeCUDAAllocator`. Therefore, we can use the first allocator in
+  // `allocator_map_` as the compute stream allocator. Although this approach is
+  // somewhat ugly and may not be robust, it is currently effective.
+  allocators.front()->GetUnderLyingAllocator()->Accept(this);
+}
+
 void FreeMemoryMetricsVisitor::Visit(
     VirtualMemoryAutoGrowthBestFitAllocator* allocator) {
   auto [large_size, sum_size] =
       allocator->SumLargestFreeBlockSizes(nums_blocks_);
   large_size_ = std::max(large_size_, large_size);
   sum_size_ = std::max(sum_size_, sum_size);
-  VLOG(1) << "Visit VirtualMemoryAutoGrowthBestFitAllocator large_free_size:"
-          << large_size_ << " sum_free_size:" << sum_size_;
 }
 
 void TryAllocVisitor::Visit(
     VirtualMemoryAutoGrowthBestFitAllocator* allocator) {
   // TODO(liujinnan): More detailed handling of multi-stream and MultiScalePool
   // scenarios.
   is_try_alloc_success_ |= allocator->TryAllocateBatch(sizes_);
-  VLOG(1) << "Visit VirtualMemoryAutoGrowthBestFitAllocator try_alloc_result:"
-          << is_try_alloc_success_;
 }
 
 void VMMFreeBlocksInfoVisitor::Visit(
diff --git a/paddle/phi/core/memory/mem_visitor.h b/paddle/phi/core/memory/mem_visitor.h
@@ -83,6 +83,16 @@ class AllocatorVisitor : public AllocatorVisitorReqImpl {
 };
 
 #ifdef PADDLE_WITH_CUDA
+/**
+ * @brief AllocatorComputeStreamVisitor is a Concrete Visitor class designed to
+ * only visit compute stream allocators.
+ *
+ * It overrides the StreamSafeCUDAAllocator visit so that only the first
+ * allocator returned by GetAllocatorByPlace() is traversed; the implementation
+ * treats that first entry as the compute stream allocator. All other Visit
+ * overloads are inherited unchanged from AllocatorVisitor.
+ */
+class AllocatorComputeStreamVisitor : public AllocatorVisitor {
+ public:
+  // Keep the inherited Visit overloads visible next to the override below.
+  using AllocatorVisitor::Visit;
+  // Forwards only to the first (assumed compute stream) allocator.
+  void Visit(StreamSafeCUDAAllocator* allocator) override;
+};
+
+
 /**
  * @brief FreeMemoryMetricsVisitor is a Concrete Visitor class designed to
  * inspect allocators for free memory information.
@@ -92,8 +102,9 @@ class AllocatorVisitor : public AllocatorVisitorReqImpl {
  * it provides specialized logic for the
  * VirtualMemoryAutoGrowthBestFitAllocator.
  */
-class FreeMemoryMetricsVisitor : public AllocatorVisitor {
+class FreeMemoryMetricsVisitor : public AllocatorComputeStreamVisitor {
  public:
+  using AllocatorComputeStreamVisitor::Visit;
   /**
    * @brief Constructor for FreeMemoryMetricsVisitor.
    * @param nums_blocks The number of largest free blocks to potentially track
@@ -139,7 +150,9 @@ class FreeMemoryMetricsVisitor : public AllocatorVisitor {
  * (typically VirtualMemoryAutoGrowthBestFitAllocator) and record if all
  * attempts were successful.
  */
-class TryAllocVisitor : public AllocatorVisitor {
+class TryAllocVisitor : public AllocatorComputeStreamVisitor {
  public:
+  // Re-expose the inherited Visit overloads, which this class's own Visit
+  // override would otherwise hide. Declared under `public:` so the access
+  // level matches FreeMemoryMetricsVisitor.
+  using AllocatorComputeStreamVisitor::Visit;
+
   /**
    * @brief Constructor.
@@ -183,13 +196,10 @@ class TryAllocVisitor : public AllocatorVisitor {
  * internal state (the list of free memory blocks) and extract key information
  * (size and address) for external analysis or debugging.
  */
-class VMMFreeBlocksInfoVisitor : public AllocatorVisitor {
- public:
-  /**
-   * @brief Default Constructor.
-   */
-  VMMFreeBlocksInfoVisitor() {}
+class VMMFreeBlocksInfoVisitor : public AllocatorComputeStreamVisitor {
+ public:
+  // Re-expose the inherited Visit overloads, which this class's own Visit
+  // override would otherwise hide. Declared under `public:` so the access
+  // level matches FreeMemoryMetricsVisitor.
+  using AllocatorComputeStreamVisitor::Visit;
 
   /**
    * @brief Retrieves the collected information about the free memory blocks.
    *
diff --git a/test/cpp/phi/memory/gen_compact_test.cc b/test/cpp/phi/memory/gen_compact_test.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ class CheckAndDoCompactTest : public ::testing::Test {
     FLAGS_try_allocate = true;
     FLAGS_use_multi_scale_virtual_memory_auto_growth = true;
     FLAGS_vmm_small_pool_size_in_mb = 2;
-    FLAGS_v = 4;
+    FLAGS_v = 10;
   }
 
   void TearDown() override { meta_tensors_.clear(); }
diff --git a/test/legacy_test/test_multi_scale_pool_allocator.py b/test/legacy_test/test_multi_scale_pool_allocator.py
@@ -63,6 +63,11 @@ def allocate_cmds(self, cmds):
             print(
                 f"reserved = {paddle_reserved2} allocated = {paddle_allocated2} auto growth = {paddle_reserved2 - paddle_reserved1} max_allocated = {paddle_max_allocated} max_reserved = {paddle_max_reserved}"
             )
+        # for multi stream
+        stream = paddle.device.cuda.Stream()
+        with paddle.device.cuda.stream_guard(stream):
+            x = paddle.empty([int(1 * 1024 * 1024 * 1024)], dtype=paddle.uint8)
+            del x
         return params
 
     def test_multi_scale_alloc_free(self):