Skip to content

Commit 0f8a06c

Browse files
committed
[flang][cuda][rt] Track asynchronous allocation stream for deallocation
1 parent 98eb476 commit 0f8a06c

File tree

2 files changed

+172
-1
lines changed

2 files changed

+172
-1
lines changed

flang-rt/lib/cuda/allocator.cpp

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "flang-rt/runtime/derived.h"
1212
#include "flang-rt/runtime/descriptor.h"
1313
#include "flang-rt/runtime/environment.h"
14+
#include "flang-rt/runtime/lock.h"
1415
#include "flang-rt/runtime/stat.h"
1516
#include "flang-rt/runtime/terminator.h"
1617
#include "flang-rt/runtime/type-info.h"
@@ -21,6 +22,106 @@
2122
#include "cuda_runtime.h"
2223

2324
namespace Fortran::runtime::cuda {
25+
26+
struct DeviceAllocation {
27+
void *ptr;
28+
std::size_t size;
29+
cudaStream_t stream;
30+
};
31+
32+
// Compare address values. nullptr will be sorted at the end of the array.
33+
int compareDeviceAlloc(const void *a, const void *b) {
34+
const DeviceAllocation *deva = (const DeviceAllocation *)a;
35+
const DeviceAllocation *devb = (const DeviceAllocation *)b;
36+
if (deva->ptr == nullptr && devb->ptr == nullptr)
37+
return 0;
38+
if (deva->ptr == nullptr)
39+
return 1;
40+
if (devb->ptr == nullptr)
41+
return -1;
42+
return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
43+
}
44+
45+
// Dynamic array for tracking asynchronous allocations.
46+
static DeviceAllocation *deviceAllocations = nullptr;
47+
Lock lock;
48+
static int maxDeviceAllocations{512}; // Initial size
49+
static int numDeviceAllocations{0};
50+
static constexpr int allocNotFound{-1};
51+
52+
static void initAllocations() {
53+
if (!deviceAllocations) {
54+
deviceAllocations = static_cast<DeviceAllocation *>(
55+
malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
56+
if (!deviceAllocations) {
57+
Terminator terminator{__FILE__, __LINE__};
58+
terminator.Crash("Failed to allocate tracking array");
59+
}
60+
}
61+
}
62+
63+
// Double the size of the allocation array when size if
64+
static void doubleAllocationArray() {
65+
unsigned newSize = maxDeviceAllocations * 2;
66+
DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
67+
realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
68+
if (!newArray) {
69+
Terminator terminator{__FILE__, __LINE__};
70+
terminator.Crash("Failed to reallocate tracking array");
71+
}
72+
deviceAllocations = newArray;
73+
maxDeviceAllocations = newSize;
74+
}
75+
76+
static unsigned findAllocation(void *ptr) {
77+
if (numDeviceAllocations == 0) {
78+
return allocNotFound;
79+
}
80+
81+
int left{0};
82+
int right{numDeviceAllocations - 1};
83+
84+
if (left == right) {
85+
return left;
86+
}
87+
88+
while (left <= right) {
89+
int mid = left + (right - left) / 2;
90+
if (deviceAllocations[mid].ptr == ptr) {
91+
return mid;
92+
}
93+
if (deviceAllocations[mid].ptr < ptr) {
94+
left = mid + 1;
95+
} else {
96+
right = mid - 1;
97+
}
98+
}
99+
return allocNotFound;
100+
}
101+
102+
static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
103+
CriticalSection critical{lock};
104+
initAllocations();
105+
if (numDeviceAllocations >= maxDeviceAllocations) {
106+
doubleAllocationArray();
107+
}
108+
deviceAllocations[numDeviceAllocations].ptr = ptr;
109+
deviceAllocations[numDeviceAllocations].size = size;
110+
deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
111+
++numDeviceAllocations;
112+
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
113+
compareDeviceAlloc);
114+
}
115+
116+
static void eraseAllocation(int pos) {
117+
deviceAllocations[pos].ptr = nullptr;
118+
deviceAllocations[pos].size = 0;
119+
deviceAllocations[pos].stream = (cudaStream_t)0;
120+
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
121+
compareDeviceAlloc);
122+
--numDeviceAllocations;
123+
}
124+
24125
extern "C" {
25126

26127
void RTDEF(CUFRegisterAllocator)() {
@@ -55,12 +156,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
55156
} else {
56157
CUDA_REPORT_IF_ERROR(
57158
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
159+
insertAllocation(p, sizeInBytes, asyncId);
58160
}
59161
}
60162
return p;
61163
}
62164

63-
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
165+
void CUFFreeDevice(void *p) {
166+
CriticalSection critical{lock};
167+
int pos = findAllocation(p);
168+
if (pos >= 0) {
169+
cudaStream_t stream = deviceAllocations[pos].stream;
170+
eraseAllocation(pos);
171+
CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
172+
} else {
173+
CUDA_REPORT_IF_ERROR(cudaFree(p));
174+
}
175+
}
64176

65177
void *CUFAllocManaged(
66178
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {

flang-rt/unittests/Runtime/CUDA/Allocatable.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
5858

5959
EXPECT_EQ(cudaSuccess, cudaGetLastError());
6060
}
61+
62+
TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
63+
using Fortran::common::TypeCategory;
64+
RTNAME(CUFRegisterAllocator)();
65+
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
66+
auto a{createAllocatable(TypeCategory::Real, 4)};
67+
a->SetAllocIdx(kDeviceAllocatorPos);
68+
EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
69+
EXPECT_FALSE(a->HasAddendum());
70+
RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
71+
72+
auto b{createAllocatable(TypeCategory::Real, 4)};
73+
b->SetAllocIdx(kDeviceAllocatorPos);
74+
EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx());
75+
EXPECT_FALSE(b->HasAddendum());
76+
RTNAME(AllocatableSetBounds)(*b, 0, 1, 20);
77+
78+
auto c{createAllocatable(TypeCategory::Real, 4)};
79+
c->SetAllocIdx(kDeviceAllocatorPos);
80+
EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx());
81+
EXPECT_FALSE(b->HasAddendum());
82+
RTNAME(AllocatableSetBounds)(*c, 0, 1, 100);
83+
84+
RTNAME(AllocatableAllocate)
85+
(*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
86+
EXPECT_TRUE(a->IsAllocated());
87+
cudaDeviceSynchronize();
88+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
89+
90+
RTNAME(AllocatableAllocate)
91+
(*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
92+
EXPECT_TRUE(b->IsAllocated());
93+
cudaDeviceSynchronize();
94+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
95+
96+
RTNAME(AllocatableAllocate)
97+
(*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
98+
EXPECT_TRUE(c->IsAllocated());
99+
cudaDeviceSynchronize();
100+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
101+
102+
RTNAME(AllocatableDeallocate)
103+
(*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
104+
EXPECT_FALSE(b->IsAllocated());
105+
cudaDeviceSynchronize();
106+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
107+
108+
RTNAME(AllocatableDeallocate)
109+
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
110+
EXPECT_FALSE(a->IsAllocated());
111+
cudaDeviceSynchronize();
112+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
113+
114+
RTNAME(AllocatableDeallocate)
115+
(*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
116+
EXPECT_FALSE(c->IsAllocated());
117+
cudaDeviceSynchronize();
118+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
119+
}

0 commit comments

Comments
 (0)