From f6c79b2e7d70f0483778d0706a538fc58ebe0fa6 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Sun, 14 Dec 2025 20:26:29 -0800 Subject: [PATCH 1/4] [offload] Fix CUDA args size by subtracting tail padding --- .../cuda/dynamic_cuda/cuda.cpp | 1 + .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 + offload/plugins-nextgen/cuda/src/rtl.cpp | 29 +++++++++++++++++-- .../offloading/CUDA/basic_launch_multi_arg.cu | 3 -- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index e7a1ca38b3c13..f630e8d850706 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2) DLWRAP(cuDevicePrimaryCtxRetain, 2) DLWRAP(cuModuleLoadDataEx, 5) DLWRAP(cuOccupancyMaxPotentialBlockSize, 6) +DLWRAP(cuFuncGetParamInfo, 4) DLWRAP(cuDeviceCanAccessPeer, 3) DLWRAP(cuCtxEnablePeerAccess, 2) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index a470d6df1079d..7e42c66dddabb 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity, CUmemAllocationGranularity_flags option); CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int); +CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *); #endif diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index a27c6f3de0cd3..6ac48255693d8 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy { // The maximum number of threads cannot exceed the maximum of the kernel. 
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); - return Plugin::success(); + // Retrieve the size of the arguments. + return initArgsSize(); } /// Launch the CUDA kernel function. @@ -173,11 +174,29 @@ struct CUDAKernelTy : public GenericKernelTy { } private: + /// Initialize the size of the arguments. + Error initArgsSize() { + CUresult Res; + size_t ArgOffset, ArgSize; + size_t Arg = 0; + + // Find the last argument to know the total size of the arguments. + while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == CUDA_SUCCESS) + ArgsSize = ArgOffset + ArgSize; + + if (Res != CUDA_ERROR_INVALID_VALUE) + return Plugin::check(Res, "error in cuFuncGetParamInfo: %s"); + return Plugin::success(); + } + /// The CUDA kernel function to execute. CUfunction Func; /// The maximum amount of dynamic shared memory per thread group. By default, /// this is set to 48 KB. mutable uint32_t MaxDynCGroupMemLimit = 49152; + + /// The size of the kernel arguments. + size_t ArgsSize = 0; }; /// Class wrapping a CUDA stream reference. These are the objects handled by the @@ -1430,6 +1449,11 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, AsyncInfoWrapperTy &AsyncInfoWrapper) const { CUDADeviceTy &CUDADevice = static_cast(GenericDevice); + // The args size passed in LaunchParams may have tail padding, which is not + // accepted by the CUDA driver. 
+ if (ArgsSize > LaunchParams.Size) + return Plugin::error(ErrorCode::INVALID_BINARY, "mismatch in kernel arguments"); + CUstream Stream; if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream)) return Err; @@ -1437,9 +1461,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t MaxDynCGroupMem = std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize()); + size_t ConfigArgsSize = ArgsSize; void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, CU_LAUNCH_PARAM_BUFFER_SIZE, - reinterpret_cast(&LaunchParams.Size), + reinterpret_cast(&ConfigArgsSize), CU_LAUNCH_PARAM_END}; // If we are running an RPC server we want to wake up the server thread diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu index 7a32983f51f7c..4e0f3a41a7a0c 100644 --- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -6,9 +6,6 @@ // clang-format on // REQUIRES: gpu -// -// FIXME: https://github.com/llvm/llvm-project/issues/161265 -// UNSUPPORTED: gpu #include From cdd16a7392a8bca067a6dcfec98f674c63be5e84 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Sun, 14 Dec 2025 20:30:00 -0800 Subject: [PATCH 2/4] Fix format --- offload/plugins-nextgen/cuda/src/rtl.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 6ac48255693d8..a60c3ca032d76 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -181,7 +181,8 @@ struct CUDAKernelTy : public GenericKernelTy { size_t Arg = 0; // Find the last argument to know the total size of the arguments. 
- while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == CUDA_SUCCESS) + while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == + CUDA_SUCCESS) ArgsSize = ArgOffset + ArgSize; if (Res != CUDA_ERROR_INVALID_VALUE) @@ -1452,7 +1453,8 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // The args size passed in LaunchParams may have tail padding, which is not // accepted by the CUDA driver. if (ArgsSize > LaunchParams.Size) - return Plugin::error(ErrorCode::INVALID_BINARY, "mismatch in kernel arguments"); + return Plugin::error(ErrorCode::INVALID_BINARY, + "mismatch in kernel arguments"); CUstream Stream; if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream)) From 2bbfdd22234b4ed4b8b4b7dd707413798e366ea3 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Sun, 14 Dec 2025 20:37:39 -0800 Subject: [PATCH 3/4] Fix comments --- offload/plugins-nextgen/cuda/src/rtl.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index a60c3ca032d76..3c41694bf9dc4 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -180,6 +180,8 @@ struct CUDAKernelTy : public GenericKernelTy { size_t ArgOffset, ArgSize; size_t Arg = 0; + ArgsSize = 0; + // Find the last argument to know the total size of the arguments. while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) == CUDA_SUCCESS) @@ -197,7 +199,7 @@ struct CUDAKernelTy : public GenericKernelTy { mutable uint32_t MaxDynCGroupMemLimit = 49152; /// The size of the kernel arguments. - size_t ArgsSize = 0; + size_t ArgsSize; }; /// Class wrapping a CUDA stream reference. These are the objects handled by the @@ -1453,7 +1455,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // The args size passed in LaunchParams may have tail padding, which is not // accepted by the CUDA driver. 
if (ArgsSize > LaunchParams.Size) - return Plugin::error(ErrorCode::INVALID_BINARY, + return Plugin::error(ErrorCode::INVALID_ARGUMENT, "mismatch in kernel arguments"); CUstream Stream; From 8934542d9a84787d56b9757fc1596c516dd6f465 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Sun, 14 Dec 2025 21:11:55 -0800 Subject: [PATCH 4/4] Add offload test with multiple kernel args --- .../OffloadAPI/device_code/CMakeLists.txt | 2 ++ .../unittests/OffloadAPI/device_code/multiargs.cpp | 3 +++ .../unittests/OffloadAPI/kernel/olLaunchKernel.cpp | 14 ++++++++++++++ 3 files changed, 19 insertions(+) create mode 100644 offload/unittests/OffloadAPI/device_code/multiargs.cpp diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt index 1a042e1b38315..22ebacf62e83e 100644 --- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt @@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo) add_offload_test_device_code(bar.cpp bar) # Compile with optimizations to eliminate AMDGPU implicit arguments. 
add_offload_test_device_code(noargs.cpp noargs -O3)
+add_offload_test_device_code(multiargs.cpp multiargs -O3)
 add_offload_test_device_code(byte.cpp byte)
 add_offload_test_device_code(localmem.cpp localmem)
 add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -15,6 +16,7 @@ add_custom_target(offload_device_binaries DEPENDS
   foo.bin
   bar.bin
   noargs.bin
+  multiargs.bin
   byte.bin
   localmem.bin
   localmem_reduction.bin
diff --git a/offload/unittests/OffloadAPI/device_code/multiargs.cpp b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
new file mode 100644
index 0000000000000..265dad124e91e
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
@@ -0,0 +1,3 @@
+#include <gpuintrin.h>
+
+extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index c9eca36a4d447..0845b9a1afdb7 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
 KERNEL_TEST(Foo, foo)
 KERNEL_TEST(NoArgs, noargs)
+KERNEL_TEST(MultiArgs, multiargs)
 KERNEL_TEST(Byte, byte)
 KERNEL_TEST(LocalMem, localmem)
 KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -135,6 +136,19 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
   ASSERT_SUCCESS(olSyncQueue(Queue));
 }
 
+TEST_P(olLaunchKernelMultiArgsTest, Success) {
+  struct {
+    char A;
+    int *B;
+    short C;
+  } Args{0, nullptr, 0};
+
+  ASSERT_SUCCESS(
+      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
 TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,