From 0f8a06c6f13a192bf4e5b566e4121ba3b4f05d0e Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Wed, 16 Apr 2025 13:53:22 -0700 Subject: [PATCH 1/2] [flang][cuda][rt] Track asynchronous allocation stream for deallocation --- flang-rt/lib/cuda/allocator.cpp | 114 +++++++++++++++++- .../unittests/Runtime/CUDA/Allocatable.cpp | 59 +++++++++ 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index a1c3a2c1b2ea8..6b85aa0efbb81 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -11,6 +11,7 @@ #include "flang-rt/runtime/derived.h" #include "flang-rt/runtime/descriptor.h" #include "flang-rt/runtime/environment.h" +#include "flang-rt/runtime/lock.h" #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/type-info.h" @@ -21,6 +22,106 @@ #include "cuda_runtime.h" namespace Fortran::runtime::cuda { + +struct DeviceAllocation { + void *ptr; + std::size_t size; + cudaStream_t stream; +}; + +// Compare address values. nullptr will be sorted at the end of the array. +int compareDeviceAlloc(const void *a, const void *b) { + const DeviceAllocation *deva = (const DeviceAllocation *)a; + const DeviceAllocation *devb = (const DeviceAllocation *)b; + if (deva->ptr == nullptr && devb->ptr == nullptr) + return 0; + if (deva->ptr == nullptr) + return 1; + if (devb->ptr == nullptr) + return -1; + return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0); +} + +// Dynamic array for tracking asynchronous allocations. +static DeviceAllocation *deviceAllocations = nullptr; +Lock lock; +static int maxDeviceAllocations{512}; // Initial size +static int numDeviceAllocations{0}; +static constexpr int allocNotFound{-1}; + +static void initAllocations() { + if (!deviceAllocations) { + deviceAllocations = static_cast( + malloc(maxDeviceAllocations * sizeof(DeviceAllocation))); + if (!deviceAllocations) { + Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Failed to allocate tracking array"); + } + } +} + +// Double the size of the allocation array when size if +static void doubleAllocationArray() { + unsigned newSize = maxDeviceAllocations * 2; + DeviceAllocation *newArray = static_cast( + realloc(deviceAllocations, newSize * sizeof(DeviceAllocation))); + if (!newArray) { + Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Failed to reallocate tracking array"); + } + deviceAllocations = newArray; + maxDeviceAllocations = newSize; +} + +static unsigned findAllocation(void *ptr) { + if (numDeviceAllocations == 0) { + return allocNotFound; + } + + int left{0}; + int right{numDeviceAllocations - 1}; + + if (left == right) { + return left; + } + + while (left <= right) { + int mid = left + (right - left) / 2; + if (deviceAllocations[mid].ptr == ptr) { + return mid; + } + if (deviceAllocations[mid].ptr < ptr) { + left = mid + 1; + } else { + right = mid - 1; + } + } + return allocNotFound; +} + +static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) { + CriticalSection critical{lock}; + initAllocations(); + if (numDeviceAllocations >= maxDeviceAllocations) { + doubleAllocationArray(); + } + deviceAllocations[numDeviceAllocations].ptr = ptr; + deviceAllocations[numDeviceAllocations].size = size; + deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream; + ++numDeviceAllocations; + qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), + compareDeviceAlloc); +} + +static void eraseAllocation(int pos) { + deviceAllocations[pos].ptr = nullptr; + deviceAllocations[pos].size = 0; + deviceAllocations[pos].stream = (cudaStream_t)0; + qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), + compareDeviceAlloc); + --numDeviceAllocations; +} + extern "C" { void RTDEF(CUFRegisterAllocator)() { @@ -55,12 +156,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) { } else { CUDA_REPORT_IF_ERROR( cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId)); + insertAllocation(p, sizeInBytes, asyncId); } } return p; } -void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } +void CUFFreeDevice(void *p) { + CriticalSection critical{lock}; + int pos = findAllocation(p); + if (pos >= 0) { + cudaStream_t stream = deviceAllocations[pos].stream; + eraseAllocation(pos); + CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream)); + } else { + CUDA_REPORT_IF_ERROR(cudaFree(p)); + } +} void *CUFAllocManaged( std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) { diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp index 1c8ded0f87d4e..89649aa95ad93 100644 --- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp +++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp @@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) { EXPECT_EQ(cudaSuccess, cudaGetLastError()); } + +TEST(AllocatableCUFTest, StreamDeviceAllocatable) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + a->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); + EXPECT_FALSE(a->HasAddendum()); + RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); + + auto b{createAllocatable(TypeCategory::Real, 4)}; + b->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx()); + EXPECT_FALSE(b->HasAddendum()); + RTNAME(AllocatableSetBounds)(*b, 0, 1, 20); + + auto c{createAllocatable(TypeCategory::Real, 4)}; + c->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx()); + EXPECT_FALSE(b->HasAddendum()); + RTNAME(AllocatableSetBounds)(*c, 0, 1, 100); + + RTNAME(AllocatableAllocate) + (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableAllocate) + (*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(b->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableAllocate) + (*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(c->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableDeallocate) + (*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(b->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableDeallocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(a->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableDeallocate) + (*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(c->IsAllocated()); + cudaDeviceSynchronize(); + EXPECT_EQ(cudaSuccess, cudaGetLastError()); +} From f07cb3f2ce93dcd1f1363bffc337077a63d8995a Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Wed, 23 Apr 2025 15:40:09 -0700 Subject: [PATCH 2/2] Fix comment --- flang-rt/lib/cuda/allocator.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index 6b85aa0efbb81..51119ab251168 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -60,7 +60,6 @@ static void initAllocations() { } } -// Double the size of the allocation array when size if static void doubleAllocationArray() { unsigned newSize = maxDeviceAllocations * 2; DeviceAllocation *newArray = static_cast(