#include <torch/csrc/inductor/aoti_package/model_package_loader.h>
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
#if defined(USE_CUDA)
+#include <c10/cuda/CUDACachingAllocator.h>
#include <cuda_runtime.h>
#endif
#if defined(USE_CUDA) || defined(USE_ROCM)
@@ -327,20 +328,26 @@ void test_aoti_double_buffering_with_tensor_constants() {
  ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
}

-void test_aoti_free_buffer() {
+void test_aoti_free_buffer(bool use_runtime_constant_folding) {
  torch::NoGradGuard no_grad;
+  size_t allocated, reserved, active;

  std::string data_path =
      (std::filesystem::path(
           STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) / "large_data.pt")
          .string();

  // Memory information variable
-  cudaError_t cudaStatus;
  size_t DATASIZE = 128 * 1024 * 1024; // We have 128MB of weight data.
+  size_t FOLDEDDATASIZE = use_runtime_constant_folding
+      ? 64 * 1024 * 1024
+      : 0; // We have 64MB of folded data.

  torch::jit::script::Module data_loader = torch::jit::load(data_path);
  std::string path_attr = "model_so_path";
+  if (use_runtime_constant_folding) {
+    path_attr += std::string("_use_runtime_constant_folding");
+  }
  std::string inputs_attr = "inputs";
  std::string outputs_attr = "outputs";
  std::string weights_attr = "w_pre";
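When runtime constant folding is enabled, the suffixed attribute name selects the .so variant compiled with that option from the loaded .pt module. A hedged illustration of how such a string attribute is typically read; the actual lookup in this test happens outside the hunks shown here, so this line is hypothetical:

// Hypothetical lookup; assumes the .pt module stores the .so path as a string
// attribute under the (possibly suffixed) name held in path_attr.
std::string model_so_path = data_loader.attr(path_attr).toStringRef();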
@@ -365,7 +372,16 @@ void test_aoti_free_buffer() {
  runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCuda>(
      model_so_path);

-  // We extract the initial memory here.
+  // We extract the memory information starting from here.
+  int device_idx = -1;
+  cudaError_t cudaStatus;
+  cudaStatus = cudaGetDevice(&device_idx);
+  if (cudaStatus != cudaSuccess || device_idx == -1) {
+    throw std::runtime_error("cudaGetDevice failed!");
+  }
+  c10::cuda::CUDACachingAllocator::DeviceStats stats =
+      c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  // At this point, one set of weights (128MB) has been loaded from the .so.
  size_t initMemory = 0;
  size_t totalMemory = 0;
  cudaStatus = cudaMemGetInfo(&initMemory, &totalMemory);
@@ -382,42 +398,83 @@ void test_aoti_free_buffer() {
  }
  ASSERT_EQ(initMemory - DATASIZE, updateMemory2);

+  // Run constant folding; this creates the folded constants (64MB) in
+  // buffer #2.
+  if (use_runtime_constant_folding) {
+    runner->run_const_fold(/* use_inactive = */ true);
+    size_t constFoldMemory = 0;
+    cudaStatus = cudaMemGetInfo(&constFoldMemory, &totalMemory);
+    if (cudaStatus != cudaSuccess) {
+      throw std::runtime_error("cudaMemGetInfo failed!");
+    }
+    ASSERT_EQ(initMemory - DATASIZE - FOLDEDDATASIZE, constFoldMemory);
+  }
+
  // We swap and free the inactive buffer. (Use #2 and free #1)
+  // Note that buffer #1 does not include the folded constants.
  runner->swap_constant_buffer();
  runner->free_inactive_constant_buffer();
  size_t postFreeMemory = 0;
  cudaStatus = cudaMemGetInfo(&postFreeMemory, &totalMemory);
  if (cudaStatus != cudaSuccess) {
    throw std::runtime_error("cudaMemGetInfo failed!");
  }
-  // We should only have one set of buffer (#2), memory used should equal
-  // initial memory.
-  ASSERT_EQ(initMemory, postFreeMemory);
+  // We should only have one buffer set (#2); available memory should equal
+  // the initial memory minus the folded constants.
+  ASSERT_EQ(initMemory - FOLDEDDATASIZE, postFreeMemory);

-  // We update random weights to buffer #1.
+  // We update random weights into buffer #1 and run constant folding.
+  // We will have two full sets of weights plus two sets of folded constants.
  runner->update_inactive_constant_buffer(rand_map);
+  runner->run_const_fold(/* use_inactive = */ true);
  size_t updateMemory1 = 0;
  cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
  if (cudaStatus != cudaSuccess) {
    throw std::runtime_error("cudaMemGetInfo failed!");
  }
-  ASSERT_EQ(initMemory - DATASIZE, updateMemory1);
-
-  // Test if we directly free the buffer #1.
+  ASSERT_EQ(initMemory - DATASIZE - 2 * FOLDEDDATASIZE, updateMemory1);
+
+  // We directly free buffer #1. This frees the DATASIZE weights.
+  // If folded constants exist, this does not directly free the cudaMalloc'd
+  // memory; it only decreases the active bytes tracked by the CachingAllocator.
+  size_t active1, active2;
+  size_t allocated1, allocated2;
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  active1 = stats.active_bytes[0].current;
+  allocated1 = stats.allocated_bytes[0].current;
  runner->free_inactive_constant_buffer();
  cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
  if (cudaStatus != cudaSuccess) {
    throw std::runtime_error("cudaMemGetInfo failed!");
  }
-  ASSERT_EQ(initMemory, updateMemory1);
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  active2 = stats.active_bytes[0].current;
+  allocated2 = stats.allocated_bytes[0].current;
+  ASSERT_EQ(initMemory - 2 * FOLDEDDATASIZE, updateMemory1);
+  ASSERT_EQ(FOLDEDDATASIZE, active1 - active2);

  // Free buffer #1 again; since #1 is already freed, nothing should change.
  runner->free_inactive_constant_buffer();
  cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
  if (cudaStatus != cudaSuccess) {
    throw std::runtime_error("cudaMemGetInfo failed!");
  }
-  ASSERT_EQ(initMemory, updateMemory1);
+  ASSERT_EQ(initMemory - 2 * FOLDEDDATASIZE, updateMemory1);
+  ASSERT_EQ(FOLDEDDATASIZE, active1 - active2);
+
+  // Swap and free #2; no weight data should remain in device memory now.
+  // However, the folded constants still occupy CUDA memory inside the
+  // CachingAllocator.
+  runner->swap_constant_buffer();
+  runner->free_inactive_constant_buffer();
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  active2 = stats.active_bytes[0].current;
+  cudaStatus = cudaMemGetInfo(&updateMemory1, &totalMemory);
+  if (cudaStatus != cudaSuccess) {
+    throw std::runtime_error("cudaMemGetInfo failed!");
+  }
+  ASSERT_EQ(initMemory + DATASIZE - 2 * FOLDEDDATASIZE, updateMemory1);
+  ASSERT_EQ(2 * FOLDEDDATASIZE, active1 - active2);
}
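The assertions in this test compare two views of GPU memory: cudaMemGetInfo reports free memory at the CUDA driver level, while the CUDACachingAllocator stats report bytes the allocator considers active. Because the folded constants are allocated through the caching allocator, freeing them lowers active_bytes without necessarily returning memory to the driver, which is why the final checks look at both counters. The helper below is a minimal sketch (not part of the test) of how both views can be read side by side; the function name and printed format are illustrative only.

#include <c10/cuda/CUDACachingAllocator.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <stdexcept>

// Print the driver-level and allocator-level memory counters for one device.
static void report_gpu_memory(int device_idx) {
  size_t free_bytes = 0, total_bytes = 0;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    throw std::runtime_error("cudaMemGetInfo failed!");
  }
  auto stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
  // Index 0 is the aggregate stat bucket, matching the test's usage above.
  std::printf(
      "driver free/total: %zu/%zu, allocator active: %zu, allocated: %zu\n",
      free_bytes,
      total_bytes,
      static_cast<size_t>(stats.active_bytes[0].current),
      static_cast<size_t>(stats.allocated_bytes[0].current));
}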

class ThreadPool {
@@ -612,7 +669,11 @@ TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
}

TEST(AotInductorTest, FreeInactiveConstantBufferCuda) {
-  test_aoti_free_buffer();
+  test_aoti_free_buffer(false);
+}
+
+TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
+  test_aoti_free_buffer(true);
}
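For context, the two TEST entries above drive test_aoti_free_buffer with runtime constant folding disabled and enabled. The snippet below is a minimal sketch of the double-buffered weight-update sequence those tests exercise on AOTIModelContainerRunnerCuda; the header path and the TensorConstantMap alias are assumed from the AOTI runner API rather than taken from this diff.

#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>

// Stage new weights into the inactive constant buffer, fold them, then swap
// buffers and release the previous one so only one full set of weights remains.
void update_weights_double_buffered(
    torch::inductor::AOTIModelContainerRunnerCuda& runner,
    const torch::inductor::TensorConstantMap& new_weights) {
  runner.update_inactive_constant_buffer(new_weights);
  runner.run_const_fold(/* use_inactive = */ true);
  runner.swap_constant_buffer();
  runner.free_inactive_constant_buffer();
}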

TEST(AotInductorTest, MultiStreamTestCuda) {