Revert "[offload] Fix CUDA args size by subtracting tail padding (#172249)"

mgorny · web-flow · commit 819e7fd3b29a · 2025-12-21T19:56:21.000+01:00
This reverts commit 35315a8.
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,7 +81,6 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
 DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
-DLWRAP(cuFuncGetParamInfo, 4)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -390,6 +390,5 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        CUmemAllocationGranularity_flags option);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
                                           CUoccupancyB2DSize, size_t, int);
-CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -149,8 +149,7 @@ struct CUDAKernelTy : public GenericKernelTy {
     // The maximum number of threads cannot exceed the maximum of the kernel.
     MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
 
-    // Retrieve the size of the arguments.
-    return initArgsSize();
+    return Plugin::success();
   }
 
   /// Launch the CUDA kernel function.
@@ -174,32 +173,11 @@ struct CUDAKernelTy : public GenericKernelTy {
   }
 
 private:
-  /// Initialize the size of the arguments.
-  Error initArgsSize() {
-    CUresult Res;
-    size_t ArgOffset, ArgSize;
-    size_t Arg = 0;
-
-    ArgsSize = 0;
-
-    // Find the last argument to know the total size of the arguments.
-    while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
-           CUDA_SUCCESS)
-      ArgsSize = ArgOffset + ArgSize;
-
-    if (Res != CUDA_ERROR_INVALID_VALUE)
-      return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
-    return Plugin::success();
-  }
-
   /// The CUDA kernel function to execute.
   CUfunction Func;
   /// The maximum amount of dynamic shared memory per thread group. By default,
   /// this is set to 48 KB.
   mutable uint32_t MaxDynCGroupMemLimit = 49152;
-
-  /// The size of the kernel arguments.
-  size_t ArgsSize;
 };
 
 /// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1452,23 +1430,16 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
 
-  // The args size passed in LaunchParams may have tail padding, which is not
-  // accepted by the CUDA driver.
-  if (ArgsSize > LaunchParams.Size)
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                         "mismatch in kernel arguments");
-
   CUstream Stream;
   if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
   uint32_t MaxDynCGroupMem =
       std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
 
-  size_t ConfigArgsSize = ArgsSize;
   void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&ConfigArgsSize),
+                    reinterpret_cast<void *>(&LaunchParams.Size),
                     CU_LAUNCH_PARAM_END};
 
   // If we are running an RPC server we want to wake up the server thread
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -6,6 +6,9 @@
 // clang-format on
 
 // REQUIRES: gpu
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
 
 #include <stdio.h>
 
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,7 +2,6 @@ add_offload_test_device_code(foo.cpp foo)
 add_offload_test_device_code(bar.cpp bar)
 # Compile with optimizations to eliminate AMDGPU implicit arguments.
 add_offload_test_device_code(noargs.cpp noargs -O3)
-add_offload_test_device_code(multiargs.cpp multiargs -O3)
 add_offload_test_device_code(byte.cpp byte)
 add_offload_test_device_code(localmem.cpp localmem)
 add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -16,7 +15,6 @@ add_custom_target(offload_device_binaries DEPENDS
     foo.bin
     bar.bin
     noargs.bin
-    multiargs.bin
     byte.bin
     localmem.bin
     localmem_reduction.bin
diff --git a/offload/unittests/OffloadAPI/device_code/multiargs.cpp b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,7 +55,6 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
 
 KERNEL_TEST(Foo, foo)
 KERNEL_TEST(NoArgs, noargs)
-KERNEL_TEST(MultiArgs, multiargs)
 KERNEL_TEST(Byte, byte)
 KERNEL_TEST(LocalMem, localmem)
 KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -136,19 +135,6 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
   ASSERT_SUCCESS(olSyncQueue(Queue));
 }
 
-TEST_P(olLaunchKernelMultiTest, Success) {
-  struct {
-    char A;
-    int *B;
-    short C;
-  } Args{0, nullptr, 0};
-
-  ASSERT_SUCCESS(
-      olLaunchKernel(Queue, Device, Kernel, Args, sizeof(Args), &LaunchArgs));
-
-  ASSERT_SUCCESS(olSyncQueue(Queue));
-}
-
 TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,