Skip to content

Commit ec7366c

Browse files
[Offload] Add olKernelMaxGroupSize
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented on CUDA; AMDGPU and Host return the legal-but-suboptimal value of `1`. Co-Authored-By: Callum Fare <[email protected]>
1 parent cfe5975 commit ec7366c

File tree

10 files changed

+113
-1
lines changed

10 files changed

+113
-1
lines changed

offload/liboffload/API/Kernel.td

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88
//
9-
// This file contains Offload API definitions related to launching kernels
9+
// This file contains Offload API definitions related to kernels
1010
//
1111
//===----------------------------------------------------------------------===//
1212

@@ -42,3 +42,18 @@ def : Function {
4242
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
4343
];
4444
}
45+
46+
// API definition for olGetKernelMaxGroupSize: takes a device, a kernel symbol
// and a dynamic shared memory amount, and reports a group size through the
// GroupSize output parameter.
// NOTE(review): the third parameter is called "SharedMemory" here, while the
// implementation in OffloadImpl.cpp names it DynamicMemSize - consider
// aligning the two names.
def : Function {
47+
let name = "olGetKernelMaxGroupSize";
48+
let desc = "Get the maximum block size needed to achieve maximum occupancy.";
49+
let details = [];
50+
let params = [
51+
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
52+
Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
53+
Param<"size_t", "SharedMemory", "dynamic shared memory required", PARAM_IN>,
54+
Param<"size_t*", "GroupSize", "maximum block size", PARAM_OUT>
55+
];
56+
// Only the error specific to this entry point is listed: passing a symbol
// that is not a kernel.
let returns = [
57+
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
58+
];
59+
}

offload/liboffload/src/OffloadImpl.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,23 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
696696
return olDestroy(Program);
697697
}
698698

699+
/// Implementation of olGetKernelMaxGroupSize.
///
/// Rejects symbols that are not kernels, then forwards the query to the
/// plugin-level kernel object and stores the result in \p GroupSize.
///
/// \param Device         device the kernel is intended to run on.
/// \param Kernel         symbol handle; must have kind OL_SYMBOL_KIND_KERNEL.
/// \param DynamicMemSize dynamic memory amount passed through to the plugin.
/// \param GroupSize      output; written only on success.
/// \returns a SYMBOL_KIND error for non-kernel symbols, any error produced by
///          the plugin's maxGroupSize, or success.
///
/// NOTE(review): Device, Kernel and GroupSize are dereferenced without a null
/// check here; presumably they are validated before this function is reached
/// (the unit tests expect INVALID_NULL_* errors) - confirm against the caller.
Error olGetKernelMaxGroupSize_impl(ol_device_handle_t Device,
700+
ol_symbol_handle_t Kernel,
701+
size_t DynamicMemSize, size_t *GroupSize) {
702+
if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
703+
return createOffloadError(ErrorCode::SYMBOL_KIND,
704+
"provided symbol is not a kernel");
705+
// The symbol kind was checked above, so fetch the plugin kernel object.
auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
706+
707+
auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
708+
// Propagate a plugin failure unchanged; GroupSize is left untouched.
if (auto Err = Res.takeError())
709+
return Err;
710+
711+
*GroupSize = *Res;
712+
713+
return Error::success();
714+
}
715+
699716
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
700717
ol_symbol_handle_t Kernel, const void *ArgumentsData,
701718
size_t ArgumentsSize,

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
570570
KernelLaunchParamsTy LaunchParams,
571571
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
572572

573+
/// Return maximum block size for maximum occupancy
574+
///
575+
/// TODO: This needs to be implemented for amdgpu
///
/// Until then this override ignores both \p GenericDevice and
/// \p DynamicMemSize and always reports a group size of 1.
576+
Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
577+
size_t DynamicMemSize) const override {
578+
// Placeholder result; no occupancy query is performed.
return 1;
579+
}
580+
573581
/// Print more elaborate kernel launch info for AMDGPU
574582
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
575583
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,9 @@ struct GenericKernelTy {
388388
KernelLaunchParamsTy LaunchParams,
389389
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
390390

391+
/// Return the group (block) size that gives maximum occupancy for this
/// kernel on \p GenericDevice, given \p DynamicMemSize of dynamic memory.
/// Pure virtual: every plugin kernel type must provide an implementation.
/// Returns the size on success or an error.
virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
392+
size_t DynamicMemSize) const = 0;
393+
391394
/// Get the kernel name.
392395
const char *getName() const { return Name.c_str(); }
393396

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
7272
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
7373
DLWRAP(cuDevicePrimaryCtxRetain, 2)
7474
DLWRAP(cuModuleLoadDataEx, 5)
75+
// Wrapper entry for cuOccupancyMaxPotentialBlockSize. The 6 presumably is the
// parameter count, matching the six-parameter declaration in cuda.h - confirm
// against the DLWRAP macro definition.
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
7576

7677
DLWRAP(cuDeviceCanAccessPeer, 3)
7778
DLWRAP(cuCtxEnablePeerAccess, 2)

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
290290
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
291291

292292
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
293+
// Callback type taking an int and returning a size_t; used as the fourth
// parameter of cuOccupancyMaxPotentialBlockSize. In the CUDA driver API this
// maps a block size to the dynamic shared memory that block size needs.
typedef size_t (*CUoccupancyB2DSize)(int);
293294

294295
CUresult cuCtxGetDevice(CUdevice *);
295296
CUresult cuDeviceGet(CUdevice *, int);
@@ -372,5 +373,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
372373
CUresult cuMemGetAllocationGranularity(size_t *granularity,
373374
const CUmemAllocationProp *prop,
374375
CUmemAllocationGranularity_flags option);
376+
// Occupancy query. Per the CUDA driver API the parameters are, in order:
// out minimum grid size, out block size, kernel function, optional
// block-size-to-dynamic-shared-memory callback, fixed dynamic shared memory
// size, and block size limit.
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
377+
CUoccupancyB2DSize, size_t, int);
375378

376379
#endif

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
157157
KernelLaunchParamsTy LaunchParams,
158158
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
159159

160+
/// Return maximum block size for maximum occupancy
///
/// Asks the CUDA driver via cuOccupancyMaxPotentialBlockSize, passing
/// \p DynamicMemSize as the fixed dynamic shared memory amount and no
/// block-size-to-shared-memory callback. The device argument is unused.
///
/// \returns the block size reported by the driver, or the error produced by
/// Plugin::check when the driver call fails.
161+
Expected<size_t> maxGroupSize(GenericDeviceTy &,
162+
size_t DynamicMemSize) const override {
163+
// Required by the driver API but not used afterwards. Both out-parameters
// are zero-initialized so they never hold indeterminate values.
int minGridSize = 0;
164+
int maxBlockSize = 0;
165+
auto Res = cuOccupancyMaxPotentialBlockSize(
166+
&minGridSize, &maxBlockSize, Func, nullptr, DynamicMemSize, INT_MAX);
167+
if (auto Err = Plugin::check(
168+
Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
169+
return Err;
170+
}
171+
// Make the int -> size_t conversion explicit.
return static_cast<size_t>(maxBlockSize);
172+
}
173+
160174
private:
161175
/// The CUDA kernel function to execute.
162176
CUfunction Func;

offload/plugins-nextgen/host/src/rtl.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,13 @@ struct GenELF64KernelTy : public GenericKernelTy {
114114
return Plugin::success();
115115
}
116116

117+
/// Return maximum block size for maximum occupancy
///
/// Not implemented for the host plugin yet: both \p Device and
/// \p DynamicMemSize are ignored and 1 is always returned.
118+
Expected<size_t> maxGroupSize(GenericDeviceTy &Device,
119+
size_t DynamicMemSize) const override {
120+
// TODO: compute a real value; 1 is a placeholder.
121+
return 1;
122+
}
123+
117124
private:
118125
/// The kernel function to execute.
119126
void (*Func)(void);

offload/unittests/OffloadAPI/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ add_offload_unittest("init"
2020
target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
2121

2222
add_offload_unittest("kernel"
23+
kernel/olGetKernelMaxGroupSize.cpp
2324
kernel/olLaunchKernel.cpp)
2425

2526
add_offload_unittest("memory"
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//===------- Offload API tests - olGetKernelMaxGroupSize ------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "../common/Fixtures.hpp"
10+
#include <OffloadAPI.h>
11+
#include <gtest/gtest.h>
12+
13+
// Fixture for the olGetKernelMaxGroupSize tests; reuses OffloadKernelTest.
// NOTE(review): the alias is spelled "olKernelGet..." while the API under
// test and this file are named "olGetKernel..." - consider aligning.
using olKernelGetMaxGroupSizeTest = OffloadKernelTest;
14+
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olKernelGetMaxGroupSizeTest);
15+
16+
// With no dynamic memory requested, the query must succeed and report a
// non-zero group size.
TEST_P(olKernelGetMaxGroupSizeTest, Success) {
17+
size_t Size{0};
18+
ASSERT_SUCCESS(olGetKernelMaxGroupSize(Device, Kernel, 0, &Size));
19+
ASSERT_GT(Size, 0u);
20+
}
21+
22+
// Same as Success, but with a non-zero dynamic memory amount (1024).
TEST_P(olKernelGetMaxGroupSizeTest, SuccessMem) {
23+
size_t Size{0};
24+
ASSERT_SUCCESS(olGetKernelMaxGroupSize(Device, Kernel, 1024, &Size));
25+
ASSERT_GT(Size, 0u);
26+
}
27+
28+
// A null kernel handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olKernelGetMaxGroupSizeTest, NullKernel) {
29+
// Left uninitialized: the call is expected to fail.
size_t Size;
30+
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
31+
olGetKernelMaxGroupSize(Device, nullptr, 0, &Size));
32+
}
33+
34+
// A null device handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olKernelGetMaxGroupSizeTest, NullDevice) {
35+
// Left uninitialized: the call is expected to fail.
size_t Size;
36+
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
37+
olGetKernelMaxGroupSize(nullptr, Kernel, 0, &Size));
38+
}
39+
40+
// A null output pointer must be rejected with OL_ERRC_INVALID_NULL_POINTER.
TEST_P(olKernelGetMaxGroupSizeTest, NullOutput) {
41+
ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
42+
olGetKernelMaxGroupSize(Device, Kernel, 0, nullptr));
43+
}

0 commit comments

Comments
 (0)