Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion offload/liboffload/API/Kernel.td
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
// This file contains Offload API definitions related to launching kernels
// This file contains Offload API definitions related to kernels
//
//===----------------------------------------------------------------------===//

Expand Down Expand Up @@ -42,3 +42,21 @@ def : Function {
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
];
}

// Offload API entry point olCalculateOptimalOccupancy.
//
// Takes a device, a kernel symbol and a dynamic shared memory size, and
// reports a suggested workgroup size through the GroupSize output pointer.
def : Function {
let name = "olCalculateOptimalOccupancy";
let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
let details = [
"For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
];
let params = [
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
// NOTE(review): documented as "per work item", but the CUDA plugin passes
// this value unchanged as the dynamic shared memory size argument of
// cuOccupancyMaxPotentialBlockSize, which the CUDA driver API describes as
// a per-block quantity -- confirm the intended unit.
Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>,
Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
];
// Error codes specific to this entry point: the symbol is not a kernel, or the
// backend has no way to answer the query.
let returns = [
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
];
}
18 changes: 18 additions & 0 deletions offload/liboffload/src/OffloadImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
return olDestroy(Program);
}

/// Implementation of olCalculateOptimalOccupancy.
///
/// Rejects symbols that are not kernels with a SYMBOL_KIND error. Otherwise
/// asks the plugin kernel object for its group size on \p Device given
/// \p DynamicMemSize and stores the answer in \p GroupSize. If the plugin
/// reports an error, that error is returned and \p GroupSize is not written.
Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
                                       ol_symbol_handle_t Kernel,
                                       size_t DynamicMemSize,
                                       size_t *GroupSize) {
  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
    return createOffloadError(ErrorCode::SYMBOL_KIND,
                              "provided symbol is not a kernel");

  auto *PluginKernel = std::get<GenericKernelTy *>(Kernel->PluginImpl);
  auto MaxSizeOrErr =
      PluginKernel->maxGroupSize(*Device->Device, DynamicMemSize);
  if (!MaxSizeOrErr)
    return MaxSizeOrErr.takeError();

  *GroupSize = *MaxSizeOrErr;
  return Error::success();
}

Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_symbol_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,
Expand Down
10 changes: 10 additions & 0 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

/// Occupancy-based group size query.
///
/// TODO: Not yet implemented for AMDGPU. Both arguments are ignored and every
/// call reports an ErrorCode::UNSUPPORTED error.
Expected<uint64_t> maxGroupSize(GenericDeviceTy & /*GenericDevice*/,
                                uint64_t /*DynamicMemSize*/) const override {
  return Plugin::error(
      ErrorCode::UNSUPPORTED,
      "occupancy calculations for AMDGPU are not yet implemented");
}

/// Print more elaborate kernel launch info for AMDGPU
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
Expand Down
3 changes: 3 additions & 0 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,9 @@ struct GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;

/// Query the group size that gives maximum occupancy for this kernel on
/// \p GenericDevice, given \p DynamicMemSize bytes of dynamic memory. Every
/// plugin kernel type must override this; plugins that cannot answer return
/// an ErrorCode::UNSUPPORTED error (as the AMDGPU and host plugins do).
virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
uint64_t DynamicMemSize) const = 0;

/// Get the kernel name.
const char *getName() const { return Name.c_str(); }

Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
// Occupancy query used to suggest a kernel block size. The count presumably
// mirrors the six parameters of the prototype in cuda.h -- keep them in sync.
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)

DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
Expand Down
3 changes: 3 additions & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;

typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
// Callback type accepted by cuOccupancyMaxPotentialBlockSize. Per the CUDA
// driver API it maps a candidate block size to the amount of dynamic shared
// memory (in bytes) a block of that size uses.
typedef size_t (*CUoccupancyB2DSize)(int);

CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
Expand Down Expand Up @@ -372,5 +373,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
// Occupancy query: writes a minimum grid size and a block size for `func`
// through the two leading output pointers. Parameter names follow the CUDA
// driver API documentation.
CUresult cuOccupancyMaxPotentialBlockSize(
    int *minGridSize, int *blockSize, CUfunction func,
    CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize,
    int blockSizeLimit);

#endif
14 changes: 14 additions & 0 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

/// Ask the CUDA driver for the block size that maximizes occupancy of this
/// kernel, forwarding \p DynamicMemSize as the dynamic shared memory size.
/// A failing driver call is turned into an error; the device argument is
/// not used.
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
                                uint64_t DynamicMemSize) const override {
  // The driver requires both output slots; only the block size is returned.
  int MinGridSize = 0;
  int MaxBlockSize = 0;

  // No block-size-to-shared-memory callback is supplied, and INT_MAX is
  // passed as the block size limit.
  auto Status = cuOccupancyMaxPotentialBlockSize(
      &MinGridSize, &MaxBlockSize, Func, nullptr, DynamicMemSize, INT_MAX);
  if (auto Err = Plugin::check(
          Status, "error in cuOccupancyMaxPotentialBlockSize: %s"))
    return Err;

  return MaxBlockSize;
}

private:
/// The CUDA kernel function to execute.
CUfunction Func;
Expand Down
8 changes: 8 additions & 0 deletions offload/plugins-nextgen/host/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
return Plugin::success();
}

/// Occupancy-based group size query.
///
/// Not implemented for the host device: both arguments are ignored and every
/// call reports an ErrorCode::UNSUPPORTED error.
Expected<uint64_t> maxGroupSize(GenericDeviceTy & /*Device*/,
                                uint64_t /*DynamicMemSize*/) const override {
  return Plugin::error(
      ErrorCode::UNSUPPORTED,
      "occupancy calculations are not implemented for the host device");
}

private:
/// The kernel function to execute.
void (*Func)(void);
Expand Down
1 change: 1 addition & 0 deletions offload/unittests/OffloadAPI/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ add_offload_unittest("init"
target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)

add_offload_unittest("kernel"
kernel/olCalculateOptimalOccupancy.cpp
kernel/olLaunchKernel.cpp)

add_offload_unittest("memory"
Expand Down
14 changes: 14 additions & 0 deletions offload/unittests/OffloadAPI/common/Fixtures.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,20 @@
} while (0)
#endif

// Evaluates ACTUAL exactly once. A null result (or one carrying
// OL_ERRC_SUCCESS) lets the test continue, OL_ERRC_UNSUPPORTED skips the
// current test, and any other error code fails it with the code and details.
// GTEST_SKIP and GTEST_FAIL both return from the enclosing function, so no
// explicit return is needed after them.
#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL) \
  do { \
    ol_result_t Res = ACTUAL; \
    if (Res) { \
      if (Res->Code == OL_ERRC_UNSUPPORTED) { \
        GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test"; \
      } \
      if (Res->Code != OL_ERRC_SUCCESS) { \
        GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": " \
                     << Res->Details; \
      } \
    } \
  } while (0)
#endif

// TODO: rework this so the EXPECTED/ACTUAL results are readable
#ifndef ASSERT_ERROR
#define ASSERT_ERROR(EXPECTED, ACTUAL) \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//===------- Offload API tests - olCalculateOptimalOccupancy --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "../common/Fixtures.hpp"
#include <OffloadAPI.h>
#include <gtest/gtest.h>

// Reuse the OffloadKernelTest fixture, which supplies the Device and Kernel
// members used by the tests below, and instantiate the suite through the
// device-fixture instantiation macro.
using olCalculateOptimalOccupancyTest = OffloadKernelTest;
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest);

// With no dynamic shared memory requested, a successful query must report a
// non-zero group size. The test is skipped when the call returns
// OL_ERRC_UNSUPPORTED.
TEST_P(olCalculateOptimalOccupancyTest, Success) {
size_t Size{0};
ASSERT_SUCCESS_OR_UNSUPPORTED(
olCalculateOptimalOccupancy(Device, Kernel, 0, &Size));
ASSERT_GT(Size, 0u);
}

// Same check as Success, but with a non-zero dynamic shared memory size
// (1024) passed to the query.
TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) {
size_t Size{0};
ASSERT_SUCCESS_OR_UNSUPPORTED(
olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size));
ASSERT_GT(Size, 0u);
}

// A null kernel handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
// Size is left uninitialized because only its address is passed as the
// output pointer.
TEST_P(olCalculateOptimalOccupancyTest, NullKernel) {
size_t Size;
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
olCalculateOptimalOccupancy(Device, nullptr, 0, &Size));
}

// A null device handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
// Size is left uninitialized because only its address is passed as the
// output pointer.
TEST_P(olCalculateOptimalOccupancyTest, NullDevice) {
size_t Size;
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size));
}

// A null output pointer must be rejected with OL_ERRC_INVALID_NULL_POINTER.
TEST_P(olCalculateOptimalOccupancyTest, NullOutput) {
ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr));
}
Loading