Skip to content

Commit a4604d9

Browse files
Merge branch 'main' into steffen/make_ext_func_fail_unsupported
2 parents 198e3ac + ed1f8bf commit a4604d9

File tree

16 files changed

+864
-154
lines changed

16 files changed

+864
-154
lines changed

source/adapters/cuda/enqueue.cpp

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <cmath>
2020
#include <cuda.h>
21+
#include <ur/ur.hpp>
2122

2223
ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
2324
uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
140141
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
141142
const size_t *GlobalWorkSize, const uint32_t WorkDim,
142143
const size_t MaxThreadsPerBlock[3],
143-
ur_kernel_handle_t Kernel, uint32_t LocalSize) {
144+
ur_kernel_handle_t Kernel) {
144145
assert(ThreadsPerBlock != nullptr);
145146
assert(GlobalWorkSize != nullptr);
146147
assert(Kernel != nullptr);
147-
int MinGrid, MaxBlockSize;
148-
size_t MaxBlockDim[3];
149148

150149
// The below assumes a three dimensional range but this is not guaranteed by
151150
// UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
154153
GlobalSizeNormalized[i] = GlobalWorkSize[i];
155154
}
156155

156+
size_t MaxBlockDim[3];
157+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
157158
MaxBlockDim[1] = Device->getMaxBlockDimY();
158159
MaxBlockDim[2] = Device->getMaxBlockDimZ();
159160

160-
UR_CHECK_ERROR(
161-
cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
162-
NULL, LocalSize, MaxThreadsPerBlock[0]));
163-
164-
ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
165-
ThreadsPerBlock[1] =
166-
std::min(GlobalSizeNormalized[1],
167-
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
168-
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
169-
ThreadsPerBlock[0] = std::min(
170-
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
171-
172-
static auto IsPowerOf2 = [](size_t Value) -> bool {
173-
return Value && !(Value & (Value - 1));
174-
};
175-
176-
// Find a local work group size that is a divisor of the global
177-
// work group size to produce uniform work groups.
178-
// Additionally, for best compute utilisation, the local size has
179-
// to be a power of two.
180-
while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
181-
!IsPowerOf2(ThreadsPerBlock[0])) {
182-
--ThreadsPerBlock[0];
183-
}
161+
int MinGrid, MaxBlockSize;
162+
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
163+
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
164+
MaxThreadsPerBlock[0]));
165+
166+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
167+
MaxBlockDim, MaxBlockSize);
184168
}
185169

186170
// Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
261245
}
262246
} else {
263247
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
264-
MaxThreadsPerBlock, Kernel, LocalSize);
248+
MaxThreadsPerBlock, Kernel);
265249
}
266250
}
267251

source/adapters/hip/enqueue.cpp

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "memory.hpp"
1717
#include "queue.hpp"
1818

19+
#include <ur/ur.hpp>
20+
1921
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
2022

2123
ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,29 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
4850
}
4951
}
5052

51-
void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
52-
const size_t *GlobalWorkSize,
53-
const size_t MaxThreadsPerBlock[3],
54-
ur_kernel_handle_t Kernel) {
53+
// Determine local work sizes that result in uniform work groups.
54+
// The default threadsPerBlock only requires handling the first work_dim
55+
// dimension.
56+
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
57+
const size_t *GlobalWorkSize, const uint32_t WorkDim,
58+
const size_t MaxThreadsPerBlock[3]) {
5559
assert(ThreadsPerBlock != nullptr);
5660
assert(GlobalWorkSize != nullptr);
57-
assert(Kernel != nullptr);
5861

59-
std::ignore = Kernel;
62+
// FIXME: The below assumes a three dimensional range but this is not
63+
// guaranteed by UR.
64+
size_t GlobalSizeNormalized[3] = {1, 1, 1};
65+
for (uint32_t i = 0; i < WorkDim; i++) {
66+
GlobalSizeNormalized[i] = GlobalWorkSize[i];
67+
}
6068

61-
ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
69+
size_t MaxBlockDim[3];
70+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
71+
MaxBlockDim[1] = Device->getMaxBlockDimY();
72+
MaxBlockDim[2] = Device->getMaxBlockDimZ();
6273

63-
// Find a local work group size that is a divisor of the global
64-
// work group size to produce uniform work groups.
65-
while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
66-
--ThreadsPerBlock[0];
67-
}
74+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
75+
MaxBlockDim, MaxThreadsPerBlock[0]);
6876
}
6977

7078
namespace {
@@ -1786,8 +1794,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
17861794
return err;
17871795
}
17881796
} else {
1789-
simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
1790-
MaxThreadsPerBlock, Kernel);
1797+
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
1798+
MaxThreadsPerBlock);
17911799
}
17921800
}
17931801

0 commit comments

Comments
 (0)