Skip to content

Commit 819e7fd

Browse files
authored
Revert "[offload] Fix CUDA args size by subtracting tail padding (#172249)"
This reverts commit 35315a8.
1 parent a88498f commit 819e7fd

File tree

7 files changed: +5 additions, -52 deletions

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,6 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
8181
DLWRAP(cuDevicePrimaryCtxRetain, 2)
8282
DLWRAP(cuModuleLoadDataEx, 5)
8383
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
84-
DLWRAP(cuFuncGetParamInfo, 4)
8584

8685
DLWRAP(cuDeviceCanAccessPeer, 3)
8786
DLWRAP(cuCtxEnablePeerAccess, 2)

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -390,6 +390,5 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
390390
CUmemAllocationGranularity_flags option);
391391
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
392392
CUoccupancyB2DSize, size_t, int);
393-
CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
394393

395394
#endif

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 2 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -149,8 +149,7 @@ struct CUDAKernelTy : public GenericKernelTy {
149149
// The maximum number of threads cannot exceed the maximum of the kernel.
150150
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
151151

152-
// Retrieve the size of the arguments.
153-
return initArgsSize();
152+
return Plugin::success();
154153
}
155154

156155
/// Launch the CUDA kernel function.
@@ -174,32 +173,11 @@ struct CUDAKernelTy : public GenericKernelTy {
174173
}
175174

176175
private:
177-
/// Initialize the size of the arguments.
178-
Error initArgsSize() {
179-
CUresult Res;
180-
size_t ArgOffset, ArgSize;
181-
size_t Arg = 0;
182-
183-
ArgsSize = 0;
184-
185-
// Find the last argument to know the total size of the arguments.
186-
while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
187-
CUDA_SUCCESS)
188-
ArgsSize = ArgOffset + ArgSize;
189-
190-
if (Res != CUDA_ERROR_INVALID_VALUE)
191-
return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
192-
return Plugin::success();
193-
}
194-
195176
/// The CUDA kernel function to execute.
196177
CUfunction Func;
197178
/// The maximum amount of dynamic shared memory per thread group. By default,
198179
/// this is set to 48 KB.
199180
mutable uint32_t MaxDynCGroupMemLimit = 49152;
200-
201-
/// The size of the kernel arguments.
202-
size_t ArgsSize;
203181
};
204182

205183
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1452,23 +1430,16 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
14521430
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
14531431
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
14541432

1455-
// The args size passed in LaunchParams may have tail padding, which is not
1456-
// accepted by the CUDA driver.
1457-
if (ArgsSize > LaunchParams.Size)
1458-
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1459-
"mismatch in kernel arguments");
1460-
14611433
CUstream Stream;
14621434
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
14631435
return Err;
14641436

14651437
uint32_t MaxDynCGroupMem =
14661438
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
14671439

1468-
size_t ConfigArgsSize = ArgsSize;
14691440
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
14701441
CU_LAUNCH_PARAM_BUFFER_SIZE,
1471-
reinterpret_cast<void *>(&ConfigArgsSize),
1442+
reinterpret_cast<void *>(&LaunchParams.Size),
14721443
CU_LAUNCH_PARAM_END};
14731444

14741445
// If we are running an RPC server we want to wake up the server thread

offload/test/offloading/CUDA/basic_launch_multi_arg.cu

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,9 @@
66
// clang-format on
77

88
// REQUIRES: gpu
9+
//
10+
// FIXME: https://github.com/llvm/llvm-project/issues/161265
11+
// UNSUPPORTED: gpu
912

1013
#include <stdio.h>
1114

offload/unittests/OffloadAPI/device_code/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@ add_offload_test_device_code(foo.cpp foo)
22
add_offload_test_device_code(bar.cpp bar)
33
# Compile with optimizations to eliminate AMDGPU implicit arguments.
44
add_offload_test_device_code(noargs.cpp noargs -O3)
5-
add_offload_test_device_code(multiargs.cpp multiargs -O3)
65
add_offload_test_device_code(byte.cpp byte)
76
add_offload_test_device_code(localmem.cpp localmem)
87
add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -16,7 +15,6 @@ add_custom_target(offload_device_binaries DEPENDS
1615
foo.bin
1716
bar.bin
1817
noargs.bin
19-
multiargs.bin
2018
byte.bin
2119
localmem.bin
2220
localmem_reduction.bin

offload/unittests/OffloadAPI/device_code/multiargs.cpp

Lines changed: 0 additions & 3 deletions
This file was deleted.

offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,6 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
5555

5656
KERNEL_TEST(Foo, foo)
5757
KERNEL_TEST(NoArgs, noargs)
58-
KERNEL_TEST(MultiArgs, multiargs)
5958
KERNEL_TEST(Byte, byte)
6059
KERNEL_TEST(LocalMem, localmem)
6160
KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -136,19 +135,6 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
136135
ASSERT_SUCCESS(olSyncQueue(Queue));
137136
}
138137

139-
TEST_P(olLaunchKernelMultiTest, Success) {
140-
struct {
141-
char A;
142-
int *B;
143-
short C;
144-
} Args{0, nullptr, 0};
145-
146-
ASSERT_SUCCESS(
147-
olLaunchKernel(Queue, Device, Kernel, Args, sizeof(Args), &LaunchArgs));
148-
149-
ASSERT_SUCCESS(olSyncQueue(Queue));
150-
}
151-
152138
TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
153139
void *Mem;
154140
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,

0 commit comments

Comments
 (0)