
Commit e6afb51

muchulee8 authored and pytorchmergebot committed
[AOTInductor] Free folded constants that are managed internally by AOTInductor (pytorch#149825)

Summary: This diff allocates the folded constants created by AOTInductor through the CUDACachingAllocator, rather than placing them in the cudaMalloc'd constant blob, which allows them to be freed independently.

Test Plan: LD_LIBRARY_PATH=/data/users/$USER/pytorch/build/lib /home/$USER/local/pytorch/build/bin/test_aoti_inference

Pull Request resolved: pytorch#149825
Approved by: https://github.com/chenyang78, https://github.com/desertfire, https://github.com/jingsh
1 parent e080bac commit e6afb51

4 files changed: +137 −37 lines
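For orientation before the per-file diffs, here is a minimal sketch (not part of the commit) of the double-buffered weight-update flow this change affects. It uses only runner APIs that the test below exercises; the map type is left generic because the exact parameter type of update_inactive_constant_buffer is not shown in this diff.

#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>

// Sketch only: WeightMap stands in for whatever map type
// update_inactive_constant_buffer accepts (rand_map in the test below).
template <typename WeightMap>
void refresh_weights(
    torch::inductor::AOTIModelContainerRunnerCuda& runner,
    const WeightMap& new_weights) {
  // Write new weights into the inactive buffer, then re-run constant
  // folding against it so its folded constants are rebuilt.
  runner.update_inactive_constant_buffer(new_weights);
  runner.run_const_fold(/* use_inactive = */ true);

  // Activate the updated buffer and release the old one. With this commit,
  // the old buffer's folded constants are returned through the
  // CUDACachingAllocator rather than freed with the cudaMalloc'd blob.
  runner.swap_constant_buffer();
  runner.free_inactive_constant_buffer();
}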

test/cpp/aoti_inference/test.cpp

Lines changed: 74 additions & 13 deletions
@@ -13,6 +13,7 @@
 #include <torch/csrc/inductor/aoti_package/model_package_loader.h>
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
 #if defined(USE_CUDA)
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <cuda_runtime.h>
 #endif
 #if defined(USE_CUDA) || defined(USE_ROCM)
@@ -327,20 +328,26 @@ void test_aoti_double_buffering_with_tensor_constants() {
   ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
 }
 
-void test_aoti_free_buffer() {
+void test_aoti_free_buffer(bool use_runtime_constant_folding) {
   torch::NoGradGuard no_grad;
+  size_t allocated, reserved, active;
 
   std::string data_path =
       (std::filesystem::path(
           STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) / "large_data.pt")
          .string();
 
   // Memory information variable
-  cudaError_t cudaStatus;
   size_t DATASIZE = 128 * 1024 * 1024; // We have 128MB of weight data.
+  size_t FOLDEDDATASIZE = use_runtime_constant_folding
+      ? 64 * 1024 * 1024
+      : 0; // We have 64MB of folded data.
 
   torch::jit::script::Module data_loader = torch::jit::load(data_path);
   std::string path_attr = "model_so_path";
+  if (use_runtime_constant_folding) {
+    path_attr += std::string("_use_runtime_constant_folding");
+  }
   std::string inputs_attr = "inputs";
   std::string outputs_attr = "outputs";
   std::string weights_attr = "w_pre";
@@ -365,7 +372,16 @@ void test_aoti_free_buffer() {
   runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCuda>(
       model_so_path);
 
-  // We extract the initial memory here.
+  // We extract the memory information starting from here.
+  int device_idx = -1;
+  cudaError_t cudaStatus;
+  cudaStatus = cudaGetDevice(&device_idx);
+  if (cudaStatus != cudaSuccess || device_idx == -1) {
+    throw std::runtime_error("cudaGetDevice failed!");
+  }
+  c10::cuda::CUDACachingAllocator::DeviceStats stats =
+      c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  // This should contain one set of weights (128MB) loaded from the .so.
   size_t initMemory = 0;
   size_t totalMemory = 0;
   cudaStatus = cudaMemGetInfo(&initMemory, &totalMemory);
@@ -382,42 +398,83 @@ void test_aoti_free_buffer() {
   }
   ASSERT_EQ(initMemory - DATASIZE, updateMemory2);
 
+  // Call run; this should run const_fold and create the folded constants
+  // in #2 (64MB).
+  if (use_runtime_constant_folding) {
+    runner->run_const_fold(/* use_inactive = */ true);
+    size_t constFoldMemory = 0;
+    cudaStatus = cudaMemGetInfo(&constFoldMemory, &totalMemory);
+    if (cudaStatus != cudaSuccess) {
+      throw std::runtime_error("cudaMemGetInfo failed!");
+    }
+    ASSERT_EQ(initMemory - DATASIZE - FOLDEDDATASIZE, constFoldMemory);
+  }
+
   // We swap and free the inactive buffer. (Use #2 and free #1)
+  // Note that buffer #1 does not include folded constants.
   runner->swap_constant_buffer();
   runner->free_inactive_constant_buffer();
   size_t postFreeMemory = 0;
   cudaStatus = cudaMemGetInfo(&postFreeMemory, &totalMemory);
   if (cudaStatus != cudaSuccess) {
     throw std::runtime_error("cudaMemGetInfo failed!");
   }
-  // We should only have one set of buffer (#2), memory used should equal
-  // initial memory.
-  ASSERT_EQ(initMemory, postFreeMemory);
+  // We should only have one set of buffers (#2); available memory should
+  // equal initial memory minus the folded constants.
+  ASSERT_EQ(initMemory - FOLDEDDATASIZE, postFreeMemory);
 
-  // We update random weights to buffer #1.
+  // We update random weights to buffer #1 and run const fold.
+  // We will have 2 full sets of data plus 2 sets of const-folded data.
   runner->update_inactive_constant_buffer(rand_map);
+  runner->run_const_fold(/* use_inactive = */ true);
   size_t updateMemory1 = 0;
   cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
   if (cudaStatus != cudaSuccess) {
     throw std::runtime_error("cudaMemGetInfo failed!");
   }
-  ASSERT_EQ(initMemory - DATASIZE, updateMemory1);
-
-  // Test if we directly free the buffer #1.
+  ASSERT_EQ(initMemory - DATASIZE - 2 * FOLDEDDATASIZE, updateMemory1);
+
+  // We directly free buffer #1. This frees the DATASIZE weight.
+  // If folded constants exist, the cudaMalloc'd memory is not freed
+  // directly; the active bytes in the CachingAllocator decrease instead.
+  size_t active1, active2;
+  size_t allocated1, allocated2;
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  active1 = stats.active_bytes[0].current;
+  allocated1 = stats.allocated_bytes[0].current;
   runner->free_inactive_constant_buffer();
   cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
   if (cudaStatus != cudaSuccess) {
     throw std::runtime_error("cudaMemGetInfo failed!");
   }
-  ASSERT_EQ(initMemory, updateMemory1);
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  active2 = stats.active_bytes[0].current;
+  allocated2 = stats.allocated_bytes[0].current;
+  ASSERT_EQ(initMemory - 2 * FOLDEDDATASIZE, updateMemory1);
+  ASSERT_EQ(FOLDEDDATASIZE, active1 - active2);
 
   // Free buffer #1 again; since #1 is freed, nothing should change.
   runner->free_inactive_constant_buffer();
   cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
   if (cudaStatus != cudaSuccess) {
     throw std::runtime_error("cudaMemGetInfo failed!");
   }
-  ASSERT_EQ(initMemory, updateMemory1);
+  ASSERT_EQ(initMemory - 2 * FOLDEDDATASIZE, updateMemory1);
+  ASSERT_EQ(FOLDEDDATASIZE, active1 - active2);
+
+  // Swap and free #2; no weight data should remain in memory now.
+  // However, the folded constants still occupy CUDA memory inside the
+  // CachingAllocator.
+  runner->swap_constant_buffer();
+  runner->free_inactive_constant_buffer();
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  active2 = stats.active_bytes[0].current;
+  cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
+  if (cudaStatus != cudaSuccess) {
+    throw std::runtime_error("cudaMemGetInfo failed!");
+  }
+  ASSERT_EQ(initMemory + DATASIZE - 2 * FOLDEDDATASIZE, updateMemory1);
+  ASSERT_EQ(2 * FOLDEDDATASIZE, active1 - active2);
 }
 
 class ThreadPool {
422479

423480
class ThreadPool {
@@ -612,7 +669,11 @@ TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
612669
}
613670

614671
TEST(AotInductorTest, FreeInactiveConstantBufferCuda) {
615-
test_aoti_free_buffer();
672+
test_aoti_free_buffer(false);
673+
}
674+
675+
TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
676+
test_aoti_free_buffer(true);
616677
}
617678

618679
TEST(AotInductorTest, MultiStreamTestCuda) {
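The key observable in this test is the gap between the two memory views it samples. A small standalone sketch of that measurement, using only the calls the test itself makes (assumption: pool index 0 of the stat arrays, as used above):

#include <c10/cuda/CUDACachingAllocator.h>
#include <cuda_runtime.h>

#include <cstdio>
#include <stdexcept>

// Device-level free memory only moves when cudaMalloc/cudaFree run;
// allocator-level active_bytes drops as soon as a folded-constant tensor
// is released back into the caching allocator's pool.
void report_cuda_memory() {
  int device_idx = -1;
  if (cudaGetDevice(&device_idx) != cudaSuccess || device_idx == -1) {
    throw std::runtime_error("cudaGetDevice failed!");
  }

  size_t free_bytes = 0, total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    throw std::runtime_error("cudaMemGetInfo failed!");
  }

  c10::cuda::CUDACachingAllocator::DeviceStats stats =
      c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
  std::printf(
      "device free: %zu, allocator active: %zu, allocator allocated: %zu\n",
      free_bytes,
      stats.active_bytes[0].current,
      stats.allocated_bytes[0].current);
}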

test/cpp/aoti_inference/test.py

Lines changed: 28 additions & 20 deletions
@@ -93,29 +93,37 @@ def generate_large_tests():
     ref_output = model(x)
 
     torch._dynamo.reset()
-    with torch.no_grad():
-        model_so_path = aot_compile(
-            model,
-            (x,),
-        )
-        # Also store a .pt2 file using the aoti_compile_and_package API
-        pt2_package_path = torch._inductor.aoti_compile_and_package(
-            torch.export.export(
+    for use_runtime_constant_folding in [True, False]:
+        with torch.no_grad():
+            model_so_path = aot_compile(
                 model,
                 (x,),
-            ),
-        )
+                options={
+                    "aot_inductor.use_runtime_constant_folding": use_runtime_constant_folding
+                },
+            )
+            # Also store a .pt2 file using the aoti_compile_and_package API
+            pt2_package_path = torch._inductor.aoti_compile_and_package(
+                torch.export.export(
+                    model,
+                    (x,),
+                ),
+                inductor_configs={
+                    "aot_inductor.use_runtime_constant_folding": use_runtime_constant_folding
+                },
+            )
 
-    large_data.update(
-        { # noqa: F541
-            "model_so_path": model_so_path,
-            "pt2_package_path": pt2_package_path,
-            "inputs": [x],
-            "outputs": [ref_output],
-            "w_pre": model.w_pre,
-            "w_add": model.w_add,
-        }
-    )
+        suffix = "_use_runtime_constant_folding" if use_runtime_constant_folding else ""
+        large_data.update(
+            { # noqa: F541
+                f"model_so_path{suffix}": model_so_path,
+                f"pt2_package_path{suffix}": pt2_package_path,
+                "inputs": [x],
+                "outputs": [ref_output],
+                "w_pre": model.w_pre,
+                "w_add": model.w_add,
+            }
+        )
 
 
 # AOTI model which will create additional tensors during autograd.
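On the consumer side, test.cpp resolves these suffixed keys back out of the data module. A sketch of that lookup (the attr()/toStringRef() calls are the standard torch::jit Module/IValue API, assumed here rather than shown in this diff):

#include <torch/script.h>

#include <string>

// Pick the .so path for the requested variant out of the data module
// written by generate_large_tests() above.
std::string model_so_path_for(
    const std::string& data_path,
    bool use_runtime_constant_folding) {
  torch::jit::script::Module data_loader = torch::jit::load(data_path);
  std::string path_attr = "model_so_path";
  if (use_runtime_constant_folding) {
    path_attr += "_use_runtime_constant_folding";
  }
  return data_loader.attr(path_attr).toStringRef();
}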

torch/csrc/inductor/aoti_runtime/model.h

Lines changed: 20 additions & 3 deletions
@@ -290,9 +290,11 @@ class AOTInductorModelBase {
 
   void load_constants() {
     size_t num_constants = this->num_constants();
+    size_t num_folded_constants = this->num_folded_constants();
     constants_map_->reserve(num_constants);
 
-    std::vector<size_t> constants_internal_offset(num_constants);
+    std::vector<size_t> constants_internal_offset(
+        num_constants - num_folded_constants);
     size_t blob_size = 0;
     compute_constant_blob(blob_size, constants_internal_offset);
 #if defined(USE_CUDA) || defined(USE_XPU)
@@ -317,7 +319,7 @@ class AOTInductorModelBase {
             constants_internal_offset[i],
             bytes_read,
             data_size,
-            from_folded)
+            /* skip_copy = */ false)
       : nullptr;
     bytes_read += data_size;
 
@@ -401,13 +403,17 @@ class AOTInductorModelBase {
       std::vector<size_t>& constants_internal_offset) {
     size_t num_constants = this->num_constants();
     blob_size = 0;
+    size_t curr_idx = 0;
    for (size_t i = 0; i < num_constants; i++) {
+      if (this->constant_from_folded(i)) {
+        continue;
+      }
       size_t data_size = this->constant_data_size(i);
       if (data_size % AOTI_CONST_ALIGNMENT) {
         data_size = AOTI_CONST_ALIGNMENT +
             (data_size / AOTI_CONST_ALIGNMENT) * AOTI_CONST_ALIGNMENT;
       }
-      constants_internal_offset[i] = blob_size;
+      constants_internal_offset[curr_idx++] = blob_size;
       blob_size += data_size;
     }
   }
@@ -424,6 +430,17 @@ class AOTInductorModelBase {
     return constants_info_.size();
   }
 
+  size_t num_folded_constants() const {
+    size_t total_consts = this->num_constants();
+    size_t folded_consts = 0;
+    for (size_t i = 0; i < total_consts; i++) {
+      if (this->constant_from_folded(i)) {
+        folded_consts++;
+      }
+    }
+    return folded_consts;
+  }
+
   const char* input_name(int64_t idx) const {
     return inputs_info_.at(idx).name;
   }
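The effect of the changed compute_constant_blob is easy to show in isolation. A self-contained sketch (ALIGNMENT and the input vectors are hypothetical stand-ins for AOTI_CONST_ALIGNMENT and the constants_info_ queries):

#include <cstddef>
#include <vector>

constexpr size_t ALIGNMENT = 64; // stand-in for AOTI_CONST_ALIGNMENT

// Folded constants are skipped entirely, so blob offsets are packed
// densely over the non-folded constants only, each size rounded up to
// the alignment boundary exactly as in compute_constant_blob above.
std::vector<size_t> pack_offsets(
    const std::vector<size_t>& sizes,
    const std::vector<bool>& from_folded) {
  std::vector<size_t> offsets;
  size_t blob_size = 0;
  for (size_t i = 0; i < sizes.size(); i++) {
    if (from_folded[i]) {
      continue; // folded constants no longer live in the constant blob
    }
    size_t data_size = sizes[i];
    if (data_size % ALIGNMENT) {
      data_size = ALIGNMENT + (data_size / ALIGNMENT) * ALIGNMENT;
    }
    offsets.push_back(blob_size);
    blob_size += data_size;
  }
  return offsets;
}

For example, sizes {100, 64, 100} with from_folded {false, true, false} yields offsets {0, 128}: the folded 64-byte constant contributes nothing to the blob, and the first constant is rounded up to 128 bytes.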

torch/csrc/inductor/aoti_runtime/model_container.h

Lines changed: 15 additions & 1 deletion
@@ -53,7 +53,8 @@ class AOTInductorModelContainer {
     }
     model->load_constants();
     constant_blob_ = model->release_constant_blob();
-    constants_internal_offset_.resize(model->num_constants());
+    constants_internal_offset_.resize(
+        model->num_constants() - model->num_folded_constants());
     model->compute_constant_blob(blob_size_, constants_internal_offset_);
 
     for (auto& model : models_) {
@@ -453,6 +454,19 @@ class AOTInductorModelContainer {
     } else {
       constant_blob_secondary_.reset();
     }
+    // Free the internally held constants
+    int num_constants = static_cast<int>(models_[0]->num_constants());
+    std::shared_ptr<ConstantMap> to_free_map =
+        use_secondary_ ? constants_map_ : constants_map_secondary_;
+
+    for (int i = 0; i < num_constants; i++) {
+      if (models_[0]->constant_from_folded(i)) {
+        auto it = to_free_map->find(models_[0]->constant_name(i));
+        if (it != to_free_map->end()) {
+          it->second.reset();
+        }
+      }
+    }
   }
 
   size_t num_inputs() const {
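Why resetting the map entry suffices: with this commit the folded constants are ordinary allocator-backed tensors, so dropping the last reference returns their storage to the CUDACachingAllocator pool (active_bytes decreases) without a cudaFree, which is exactly what the test asserts. A simplified sketch of that mechanism; SimpleConstantMap is a stand-in, not the runtime's actual ConstantMap type:

#include <memory>
#include <string>
#include <unordered_map>

#include <ATen/ATen.h>

// Stand-in for the runtime's ConstantMap; the real map holds tensor
// handles rather than std::shared_ptr<at::Tensor>.
using SimpleConstantMap =
    std::unordered_map<std::string, std::shared_ptr<at::Tensor>>;

void release_folded_constant(SimpleConstantMap& map, const std::string& name) {
  auto it = map.find(name);
  if (it != map.end()) {
    // Dropping the reference frees the storage back into the caching
    // allocator's pool; device-level free memory is unchanged.
    it->second.reset();
  }
}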
