Skip to content

Commit f5609aa

Browse files
authored
[flang][cuda] Use a reference for asyncObject (#140614)
Switch from `int64_t` to `int64_t*` to fit with the rest of the implementation. New tentative with some fix. The previous was reverted some time ago. Reviewed in #138010
1 parent a04cff1 commit f5609aa

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+183
-184
lines changed

flang-rt/include/flang-rt/runtime/allocator-registry.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
namespace Fortran::runtime {
2121

22-
using AllocFct = void *(*)(std::size_t, std::int64_t);
22+
using AllocFct = void *(*)(std::size_t, std::int64_t *);
2323
using FreeFct = void (*)(void *);
2424

2525
typedef struct Allocator_t {
@@ -28,7 +28,7 @@ typedef struct Allocator_t {
2828
} Allocator_t;
2929

3030
static RT_API_ATTRS void *MallocWrapper(
31-
std::size_t size, [[maybe_unused]] std::int64_t) {
31+
std::size_t size, [[maybe_unused]] std::int64_t *) {
3232
return std::malloc(size);
3333
}
3434
#ifdef RT_DEVICE_COMPILATION

flang-rt/include/flang-rt/runtime/descriptor.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929
#include <cstdio>
3030
#include <cstring>
3131

32-
/// Value used for asyncId when no specific stream is specified.
33-
static constexpr std::int64_t kNoAsyncId = -1;
32+
/// Value used for asyncObject when no specific stream is specified.
33+
static constexpr std::int64_t *kNoAsyncObject = nullptr;
3434

3535
namespace Fortran::runtime {
3636

@@ -372,7 +372,7 @@ class Descriptor {
372372
// before calling. It (re)computes the byte strides after
373373
// allocation. Does not allocate automatic components or
374374
// perform default component initialization.
375-
RT_API_ATTRS int Allocate(std::int64_t asyncId);
375+
RT_API_ATTRS int Allocate(std::int64_t *asyncObject);
376376
RT_API_ATTRS void SetByteStrides();
377377

378378
// Deallocates storage; does not call FINAL subroutines or

flang-rt/include/flang-rt/runtime/reduction-templates.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
347347
// as the element size of the source.
348348
result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
349349
CFI_attribute_allocatable);
350-
if (int stat{result.Allocate(kNoAsyncId)}) {
350+
if (int stat{result.Allocate(kNoAsyncObject)}) {
351351
terminator.Crash(
352352
"%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
353353
}

flang-rt/lib/cuda/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ add_flangrt_library(flang_rt.cuda STATIC SHARED
1414
kernel.cpp
1515
memmove-function.cpp
1616
memory.cpp
17+
pointer.cpp
1718
registration.cpp
1819

1920
TARGET_PROPERTIES

flang-rt/lib/cuda/allocatable.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ namespace Fortran::runtime::cuda {
2323
extern "C" {
2424
RT_EXT_API_GROUP_BEGIN
2525

26-
int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
26+
int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
2727
bool *pinned, bool hasStat, const Descriptor *errMsg,
2828
const char *sourceFile, int sourceLine) {
2929
int stat{RTNAME(CUFAllocatableAllocate)(
@@ -41,7 +41,7 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
4141
return stat;
4242
}
4343

44-
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
44+
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
4545
bool *pinned, bool hasStat, const Descriptor *errMsg,
4646
const char *sourceFile, int sourceLine) {
4747
if (desc.HasAddendum()) {
@@ -63,7 +63,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
6363
}
6464

6565
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
66-
const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
66+
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
6767
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
6868
int stat{RTNAME(CUFAllocatableAllocate)(
6969
alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
@@ -76,7 +76,7 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
7676
}
7777

7878
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
79-
const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
79+
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
8080
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
8181
int stat{RTNAME(CUFAllocatableAllocateSync)(
8282
alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};

flang-rt/lib/cuda/allocator.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,15 @@ static unsigned findAllocation(void *ptr) {
9898
return allocNotFound;
9999
}
100100

101-
static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
101+
static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) {
102102
CriticalSection critical{lock};
103103
initAllocations();
104104
if (numDeviceAllocations >= maxDeviceAllocations) {
105105
doubleAllocationArray();
106106
}
107107
deviceAllocations[numDeviceAllocations].ptr = ptr;
108108
deviceAllocations[numDeviceAllocations].size = size;
109-
deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
109+
deviceAllocations[numDeviceAllocations].stream = stream;
110110
++numDeviceAllocations;
111111
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
112112
compareDeviceAlloc);
@@ -136,26 +136,26 @@ void RTDEF(CUFRegisterAllocator)() {
136136
}
137137

138138
void *CUFAllocPinned(
139-
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
139+
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
140140
void *p;
141141
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
142142
return p;
143143
}
144144

145145
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
146146

147-
void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
147+
void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
148148
void *p;
149149
if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
150150
CUDA_REPORT_IF_ERROR(
151151
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
152152
} else {
153-
if (asyncId == kNoAsyncId) {
153+
if (asyncObject == kNoAsyncObject) {
154154
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
155155
} else {
156156
CUDA_REPORT_IF_ERROR(
157-
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
158-
insertAllocation(p, sizeInBytes, asyncId);
157+
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
158+
insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
159159
}
160160
}
161161
return p;
@@ -174,7 +174,7 @@ void CUFFreeDevice(void *p) {
174174
}
175175

176176
void *CUFAllocManaged(
177-
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
177+
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
178178
void *p;
179179
CUDA_REPORT_IF_ERROR(
180180
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -184,9 +184,9 @@ void *CUFAllocManaged(
184184
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
185185

186186
void *CUFAllocUnified(
187-
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
187+
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
188188
// Call alloc managed for the time being.
189-
return CUFAllocManaged(sizeInBytes, asyncId);
189+
return CUFAllocManaged(sizeInBytes, asyncObject);
190190
}
191191

192192
void CUFFreeUnified(void *p) {

flang-rt/lib/cuda/descriptor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ RT_EXT_API_GROUP_BEGIN
2121
Descriptor *RTDEF(CUFAllocDescriptor)(
2222
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
2323
return reinterpret_cast<Descriptor *>(
24-
CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
24+
CUFAllocManaged(sizeInBytes, /*asyncObject=*/nullptr));
2525
}
2626

2727
void RTDEF(CUFFreeDescriptor)(

flang-rt/lib/cuda/pointer.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ namespace Fortran::runtime::cuda {
2222
extern "C" {
2323
RT_EXT_API_GROUP_BEGIN
2424

25-
int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
25+
int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
2626
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
2727
int sourceLine) {
2828
if (desc.HasAddendum()) {
@@ -43,7 +43,7 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
4343
return stat;
4444
}
4545

46-
int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
46+
int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
4747
bool *pinned, bool hasStat, const Descriptor *errMsg,
4848
const char *sourceFile, int sourceLine) {
4949
int stat{RTNAME(CUFPointerAllocate)(
@@ -62,7 +62,7 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
6262
}
6363

6464
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
65-
const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
65+
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
6666
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
6767
int stat{RTNAME(CUFPointerAllocate)(
6868
pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
@@ -75,7 +75,7 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
7575
}
7676

7777
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
78-
const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
78+
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
7979
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
8080
int stat{RTNAME(CUFPointerAllocateSync)(
8181
pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};

flang-rt/lib/runtime/allocatable.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,17 +133,17 @@ void RTDEF(AllocatableApplyMold)(
133133
}
134134
}
135135

136-
int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
137-
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
138-
int sourceLine) {
136+
int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
137+
std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
138+
const char *sourceFile, int sourceLine) {
139139
Terminator terminator{sourceFile, sourceLine};
140140
if (!descriptor.IsAllocatable()) {
141141
return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
142142
} else if (descriptor.IsAllocated()) {
143143
return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
144144
} else {
145-
int stat{
146-
ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
145+
int stat{ReturnError(
146+
terminator, descriptor.Allocate(asyncObject), errMsg, hasStat)};
147147
if (stat == StatOk) {
148148
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
149149
if (const auto *derived{addendum->derivedType()}) {
@@ -162,7 +162,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
162162
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
163163
const char *sourceFile, int sourceLine) {
164164
int stat{RTNAME(AllocatableAllocate)(
165-
alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
165+
alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)};
166166
if (stat == StatOk) {
167167
Terminator terminator{sourceFile, sourceLine};
168168
DoFromSourceAssign(alloc, source, terminator);

flang-rt/lib/runtime/array-constructor.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
5050
initialAllocationSize(fromElements, to.ElementBytes())};
5151
to.GetDimension(0).SetBounds(1, allocationSize);
5252
RTNAME(AllocatableAllocate)
53-
(to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
53+
(to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
5454
vector.sourceFile, vector.sourceLine);
5555
to.GetDimension(0).SetBounds(1, fromElements);
5656
vector.actualAllocationSize = allocationSize;
@@ -59,7 +59,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
5959
// first value: there should be no reallocation.
6060
RUNTIME_CHECK(terminator, previousToElements >= fromElements);
6161
RTNAME(AllocatableAllocate)
62-
(to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
62+
(to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
6363
vector.sourceFile, vector.sourceLine);
6464
vector.actualAllocationSize = previousToElements;
6565
}

0 commit comments

Comments
 (0)