Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
DLWRAP(cuFuncGetParamInfo, 4)

DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)
Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);

#endif
33 changes: 31 additions & 2 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

return Plugin::success();
// Retrieve the size of the arguments.
return initArgsSize();
}

/// Launch the CUDA kernel function.
Expand All @@ -173,11 +174,32 @@ struct CUDAKernelTy : public GenericKernelTy {
}

private:
/// Compute the total size of the kernel arguments and store it in ArgsSize.
///
/// The kernel's parameters are queried one index at a time through
/// cuFuncGetParamInfo. After each successful query ArgsSize is overwritten
/// with that parameter's offset plus its size, so once the walk ends it holds
/// the end position of the last parameter that was reported. A kernel for
/// which no parameter is reported leaves ArgsSize at zero.
///
/// \returns success when the walk stops with CUDA_ERROR_INVALID_VALUE, which
/// is treated here as the expected "no parameter at this index" result;
/// otherwise whatever Plugin::check produces for the driver status.
Error initArgsSize() {
  ArgsSize = 0;

  CUresult Status;
  for (size_t ParamIdx = 0;; ++ParamIdx) {
    size_t ParamOffset, ParamSize;
    Status = cuFuncGetParamInfo(Func, ParamIdx, &ParamOffset, &ParamSize);
    if (Status != CUDA_SUCCESS)
      break;

    // Keep only the most recent end position; the last successful query
    // determines the final value.
    ArgsSize = ParamOffset + ParamSize;
  }

  // Running past the final parameter is how the walk is expected to end; any
  // other status is forwarded to the generic error check.
  if (Status == CUDA_ERROR_INVALID_VALUE)
    return Plugin::success();
  return Plugin::check(Status, "error in cuFuncGetParamInfo: %s");
}

/// The CUDA kernel function to execute.
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152;

/// The size of the kernel arguments.
size_t ArgsSize;
};

/// Class wrapping a CUDA stream reference. These are the objects handled by the
Expand Down Expand Up @@ -1430,16 +1452,23 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

// The args size passed in LaunchParams may have tail padding, which is not
// accepted by the CUDA driver.
if (ArgsSize > LaunchParams.Size)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"mismatch in kernel arguments");

CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;

uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
reinterpret_cast<void *>(&LaunchParams.Size),
reinterpret_cast<void *>(&ConfigArgsSize),
CU_LAUNCH_PARAM_END};

// If we are running an RPC server we want to wake up the server thread
Expand Down
3 changes: 0 additions & 3 deletions offload/test/offloading/CUDA/basic_launch_multi_arg.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
// clang-format on

// REQUIRES: gpu
//
// FIXME: https://github.com/llvm/llvm-project/issues/161265
// UNSUPPORTED: gpu

#include <stdio.h>

Expand Down
Loading