[SYCL] Extract args directly from kernel if we can (intel#18387)

Pennycook · web-flow · commit 1f76dfdfb8b1 · 2025-05-16T07:59:00.000-07:00
In some cases, all values that need to be passed as kernel arguments are stored within the kernel function object, and their offsets can be calculated using the integration header or equivalent built-ins. In such cases, we can therefore set kernel arguments directly without staging via `MArgs`. This first attempt is limited to the simplest cases where all kernel arguments are either standard layout types or pointers. It may be possible to extend this approach to cover other cases, but only if some classes are redesigned. The implementation currently stores some information (e.g., the number of kernel arguments) inside of the handler, because there is no way to access the kernel type within `handler::finalize()`. --- Some notes for reviewers: - This depends on the new `hasSpecialCaptures` functionality introduced in intel#18386, which returns `false` for kernels that only capture standard layout classes and pointers (and `true` if any other kind of capture is present). - There are some seemingly unrelated changes in kernel_desc.hpp and to some of the unit tests. These changes were necessary because `hasSpecialCaptures` requires `getParamDesc` to be `constexpr`. I think this wasn't picked up during intel#18386 because `hasSpecialCaptures` wasn't previously being run for every kernel. - I'm not really satisfied by the solution of adding a lot more member variables, but it was the best way I could think of to limit the scope of the changes required. Long-term, it would be better to try and move everything (including the complicated cases) to extract everything directly from the lambda, to design an abstraction that unifies the `MArgs` and `MKernelFuncPtr` paths, or to find a way to access the required values without them being stored in the handler (e.g., using something like intel#18081).
--- This change was motivated by profiling of the `SubmitKernel` benchmark in the https://github.com/intel/compute-benchmarks/ suite, which can be run with a command similar to: ``` /build/bin/api_overhead_benchmark_sycl --test=SubmitKernel --csv --noHeaders --Ioq=1 --MeasureCompletion=0 --iterations=100000 --Profiling=0 --NumKernels=10 --KernelExecTime=1 --UseEvents=0 ``` This is the simplest submission case there is, appending a kernel with no special arguments to an in-order queue. In the benchmarks on my machine, I saw around 1-2% of execution time spent in calls to `extractArgsAndReqsFromLambda`, attributed to populating the `MArgs` vector using information from the integration headers. This PR removes the need to call `extractArgsAndReqsFromLambda` entirely in the submission path used by this benchmark, thus improving performance. --------- Signed-off-by: John Pennycook <john.pennycook@intel.com>
diff --git a/sycl/include/sycl/detail/kernel_desc.hpp b/sycl/include/sycl/detail/kernel_desc.hpp
@@ -262,7 +262,7 @@ template <typename KernelNameType> constexpr int64_t getKernelSize() {
 
 template <typename KernelNameType> constexpr bool hasSpecialCaptures() {
   bool FoundSpecialCapture = false;
-  for (int I = 0; I < getKernelNumParams<KernelNameType>(); ++I) {
+  for (unsigned I = 0; I < getKernelNumParams<KernelNameType>(); ++I) {
     auto ParamDesc = getKernelParamDesc<KernelNameType>(I);
     bool IsSpecialCapture =
         (ParamDesc.kind != kernel_param_kind_t::kind_std_layout &&
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
@@ -751,8 +751,8 @@ class __SYCL_EXPORT handler {
 
   /// Stores lambda to the template-free object
   ///
-  /// Also initializes kernel name, list of arguments and requirements using
-  /// information from the integration header/built-ins.
+  /// Also initializes the kernel name and prepares for arguments to
+  /// be extracted from the lambda in handler::finalize().
   ///
   /// \param KernelFunc is a SYCL kernel function
   /// \param ParamDescs is the vector of kernel parameter descriptors.
@@ -798,11 +798,13 @@ class __SYCL_EXPORT handler {
     if constexpr (KernelHasName) {
       // TODO support ESIMD in no-integration-header case too.
 
-      clearArgs();
-      extractArgsAndReqsFromLambda(MHostKernel->getPtr(),
-                                   &(detail::getKernelParamDesc<KernelName>),
-                                   detail::getKernelNumParams<KernelName>(),
-                                   detail::isKernelESIMD<KernelName>());
+      // Force hasSpecialCaptures to be evaluated at compile-time.
+      constexpr bool HasSpecialCapt = detail::hasSpecialCaptures<KernelName>();
+      setKernelInfo((void *)MHostKernel->getPtr(),
+                    detail::getKernelNumParams<KernelName>(),
+                    &(detail::getKernelParamDesc<KernelName>),
+                    detail::isKernelESIMD<KernelName>(), HasSpecialCapt);
+
       MKernelName = detail::getKernelName<KernelName>();
     } else {
       // In case w/o the integration header it is necessary to process
@@ -3764,6 +3766,10 @@ class __SYCL_EXPORT handler {
                                   sycl::range<3> LocalSize, sycl::id<3> Offset,
                                   int Dims);
 
+  void setKernelInfo(void *KernelFuncPtr, int KernelNumArgs,
+                     detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
+                     bool KernelIsESIMD, bool KernelHasSpecialCaptures);
+
   friend class detail::HandlerAccess;
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp
@@ -198,6 +198,13 @@ class handler_impl {
 
   // Allocation ptr to be freed asynchronously.
   void *MFreePtr = nullptr;
+
+  // Store information about the kernel arguments.
+  void *MKernelFuncPtr = nullptr;
+  int MKernelNumArgs = 0;
+  detail::kernel_param_desc_t (*MKernelParamDescGetter)(int) = nullptr;
+  bool MKernelIsESIMD = false;
+  bool MKernelHasSpecialCaptures = true;
 };
 
 } // namespace detail
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
@@ -2385,7 +2385,10 @@ static ur_result_t SetKernelParamsAndLaunch(
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     bool IsCooperative, bool KernelUsesClusterLaunch,
     uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage,
-    KernelNameStrRefT KernelName) {
+    KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr,
+    int KernelNumArgs = 0,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr,
+    bool KernelHasSpecialCaptures = true) {
   assert(Queue && "Kernel submissions should have an associated queue");
   const AdapterPtr &Adapter = Queue->getAdapter();
 
@@ -2397,13 +2400,38 @@ static ur_result_t SetKernelParamsAndLaunch(
                               : Empty);
   }
 
-  auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
-                  &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
-    SetArgBasedOnType(Adapter, Kernel, DeviceImageImpl, getMemAllocationFunc,
-                      Queue->getContextImplPtr(), Arg, NextTrueIndex);
-  };
-
-  applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
+  if (KernelFuncPtr && !KernelHasSpecialCaptures) {
+    auto setFunc = [&Adapter, Kernel,
+                    KernelFuncPtr](const detail::kernel_param_desc_t &ParamDesc,
+                                   size_t NextTrueIndex) {
+      const void *ArgPtr = (const char *)KernelFuncPtr + ParamDesc.offset;
+      switch (ParamDesc.kind) {
+      case kernel_param_kind_t::kind_std_layout: {
+        int Size = ParamDesc.info;
+        Adapter->call<UrApiKind::urKernelSetArgValue>(Kernel, NextTrueIndex,
+                                                      Size, nullptr, ArgPtr);
+        break;
+      }
+      case kernel_param_kind_t::kind_pointer: {
+        const void *Ptr = *static_cast<const void *const *>(ArgPtr);
+        Adapter->call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
+                                                        nullptr, Ptr);
+        break;
+      }
+      default:
+        throw std::runtime_error("Direct kernel argument copy failed.");
+      }
+    };
+    applyFuncOnFilteredArgs(EliminatedArgMask, KernelNumArgs,
+                            KernelParamDescGetter, setFunc);
+  } else {
+    auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
+                    &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
+      SetArgBasedOnType(Adapter, Kernel, DeviceImageImpl, getMemAllocationFunc,
+                        Queue->getContextImplPtr(), Arg, NextTrueIndex);
+    };
+    applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
+  }
 
   std::optional<int> ImplicitLocalArg =
       ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName);
@@ -2655,7 +2683,9 @@ void enqueueImpKernel(
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative,
     const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize,
-    const RTDeviceBinaryImage *BinImage) {
+    const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr, int KernelNumArgs,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
+    bool KernelHasSpecialCaptures) {
   assert(Queue && "Kernel submissions should have an associated queue");
   // Run OpenCL kernel
   auto &ContextImpl = Queue->getContextImplPtr();
@@ -2739,7 +2769,8 @@ void enqueueImpKernel(
         Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
         OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
         KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize,
-        BinImage, KernelName);
+        BinImage, KernelName, KernelFuncPtr, KernelNumArgs,
+        KernelParamDescGetter, KernelHasSpecialCaptures);
 
     const AdapterPtr &Adapter = Queue->getAdapter();
     if (!SyclKernelImpl && !MSyclKernel) {
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
@@ -626,7 +626,10 @@ void enqueueImpKernel(
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative,
     const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize,
-    const RTDeviceBinaryImage *BinImage = nullptr);
+    const RTDeviceBinaryImage *BinImage = nullptr,
+    void *KernelFuncPtr = nullptr, int KernelNumArgs = 0,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr,
+    bool KernelHasSpecialCaptures = true);
 
 /// The exec CG command enqueues execution of kernel or explicit memory
 /// operation.
@@ -780,6 +783,27 @@ void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask,
   }
 }
 
+template <typename FuncT>
+void applyFuncOnFilteredArgs(
+    const KernelArgMask *EliminatedArgMask, int KernelNumArgs,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int), FuncT Func) {
+  if (!EliminatedArgMask || EliminatedArgMask->size() == 0) {
+    for (int I = 0; I < KernelNumArgs; ++I) {
+      const detail::kernel_param_desc_t &Param = KernelParamDescGetter(I);
+      Func(Param, I);
+    }
+  } else {
+    size_t NextTrueIndex = 0;
+    for (int I = 0; I < KernelNumArgs; ++I) {
+      const detail::kernel_param_desc_t &Param = KernelParamDescGetter(I);
+      if ((*EliminatedArgMask)[I])
+        continue;
+      Func(Param, NextTrueIndex);
+      ++NextTrueIndex;
+    }
+  }
+}
+
 void ReverseRangeDimensionsForKernel(NDRDescT &NDR);
 
 } // namespace detail
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
@@ -411,6 +411,24 @@ event handler::finalize() {
     return MLastEvent;
   MIsFinalized = true;
 
+  const auto &type = getType();
+  const bool KernelFastPath =
+      (MQueue && !impl->MGraph && !impl->MSubgraphNode &&
+       !MQueue->hasCommandGraph() && !impl->CGData.MRequirements.size() &&
+       !MStreamStorage.size() &&
+       detail::Scheduler::areEventsSafeForSchedulerBypass(
+           impl->CGData.MEvents, MQueue->getContextImplPtr()));
+
+  // Extract arguments from the kernel lambda, if required.
+  // Skipping this is currently limited to simple kernels on the fast path.
+  if (type == detail::CGType::Kernel && impl->MKernelFuncPtr &&
+      (!KernelFastPath || impl->MKernelHasSpecialCaptures)) {
+    clearArgs();
+    extractArgsAndReqsFromLambda((char *)impl->MKernelFuncPtr,
+                                 impl->MKernelParamDescGetter,
+                                 impl->MKernelNumArgs, impl->MKernelIsESIMD);
+  }
+
   // According to 4.7.6.9 of SYCL2020 spec, if a placeholder accessor is passed
   // to a command without being bound to a command group, an exception should
   // be thrown.
@@ -448,7 +466,6 @@ event handler::finalize() {
     }
   }
 
-  const auto &type = getType();
   if (type == detail::CGType::Kernel) {
     // If there were uses of set_specialization_constant build the kernel_bundle
     std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImpPtr =
@@ -507,11 +524,7 @@ event handler::finalize() {
       }
     }
 
-    if (MQueue && !impl->MGraph && !impl->MSubgraphNode &&
-        !MQueue->hasCommandGraph() && !impl->CGData.MRequirements.size() &&
-        !MStreamStorage.size() &&
-        detail::Scheduler::areEventsSafeForSchedulerBypass(
-            impl->CGData.MEvents, MQueue->getContextImplPtr())) {
+    if (KernelFastPath) {
       // if user does not add a new dependency to the dependency graph, i.e.
       // the graph is not changed, then this faster path is used to submit
       // kernel bypassing scheduler and avoiding CommandGroup, Command objects
@@ -557,13 +570,14 @@ event handler::finalize() {
               detail::retrieveKernelBinary(MQueue, MKernelName.data());
           assert(BinImage && "Failed to obtain a binary image.");
         }
-        enqueueImpKernel(MQueue, impl->MNDRDesc, impl->MArgs,
-                         KernelBundleImpPtr, MKernel.get(), MKernelName.data(),
-                         RawEvents,
-                         DiscardEvent ? nullptr : LastEventImpl.get(), nullptr,
-                         impl->MKernelCacheConfig, impl->MKernelIsCooperative,
-                         impl->MKernelUsesClusterLaunch,
-                         impl->MKernelWorkGroupMemorySize, BinImage);
+        enqueueImpKernel(
+            MQueue, impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr,
+            MKernel.get(), MKernelName.data(), RawEvents,
+            DiscardEvent ? nullptr : LastEventImpl.get(), nullptr,
+            impl->MKernelCacheConfig, impl->MKernelIsCooperative,
+            impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize,
+            BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs,
+            impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
         if (xptiEnabled) {
           // Emit signal only when event is created
@@ -2260,6 +2274,17 @@ void handler::setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems,
   impl->MNDRDesc = NDRDescT{NumWorkItems, LocalSize, Offset, Dims};
 }
 
+void handler::setKernelInfo(
+    void *KernelFuncPtr, int KernelNumArgs,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
+    bool KernelIsESIMD, bool KernelHasSpecialCaptures) {
+  impl->MKernelFuncPtr = KernelFuncPtr;
+  impl->MKernelNumArgs = KernelNumArgs;
+  impl->MKernelParamDescGetter = KernelParamDescGetter;
+  impl->MKernelIsESIMD = KernelIsESIMD;
+  impl->MKernelHasSpecialCaptures = KernelHasSpecialCaptures;
+}
+
 void handler::saveCodeLoc(detail::code_location CodeLoc, bool IsTopCodeLoc) {
   MCodeLoc = CodeLoc;
   impl->MIsTopCodeLoc = IsTopCodeLoc;
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3524,6 +3524,7 @@ _ZN4sycl3_V17handler12setArgHelperEiRNS0_3ext6oneapi12experimental6detail30dynam
 _ZN4sycl3_V17handler12setArgHelperEiRNS0_6detail22work_group_memory_implE
 _ZN4sycl3_V17handler12setArgHelperEiONS0_6streamE
 _ZN4sycl3_V17handler13getKernelNameEv
+_ZN4sycl3_V17handler13setKernelInfoEPviPFNS0_6detail19kernel_param_desc_tEiEbb
 _ZN4sycl3_V17handler14addAccessorReqESt10shared_ptrINS0_6detail16AccessorImplHostEE
 _ZN4sycl3_V17handler14setNDRangeUsedEb
 _ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleENS0_5rangeILi3EEERKNS4_16image_descriptorEPvS7_S7_S7_
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
@@ -4378,6 +4378,7 @@
 ?setHandlerKernelBundle@handler@_V1@sycl@@AEAAXVkernel@23@@Z
 ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4StableKernelCacheConfig@123@@Z
 ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@H@Z
+?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z
 ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z
 ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z
 ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z
diff --git a/sycl/unittests/Extensions/USMMemcpy2D.cpp b/sycl/unittests/Extensions/USMMemcpy2D.cpp
diff --git a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp
diff --git a/sycl/unittests/buffer/KernelArgMemObj.cpp b/sycl/unittests/buffer/KernelArgMemObj.cpp
diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp