oneapi-src
diff --git a/‎include/ur_api.h‎
Lines changed: 7 additions & 2 deletions b/‎include/ur_api.h‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎scripts/core/exp-command-buffer.yml‎
Lines changed: 7 additions & 3 deletions b/‎scripts/core/exp-command-buffer.yml‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎source/adapters/cuda/command_buffer.cpp‎
Lines changed: 38 additions & 21 deletions b/‎source/adapters/cuda/command_buffer.cpp‎
Lines changed: 38 additions & 21 deletions
diff --git a/‎source/adapters/cuda/command_buffer.hpp‎
Lines changed: 10 additions & 1 deletion b/‎source/adapters/cuda/command_buffer.hpp‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎source/adapters/mock/ur_mockddi.cpp‎
Lines changed: 3 additions & 2 deletions b/‎source/adapters/mock/ur_mockddi.cpp‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎source/loader/layers/tracing/ur_trcddi.cpp‎
Lines changed: 3 additions & 2 deletions b/‎source/loader/layers/tracing/ur_trcddi.cpp‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎source/loader/layers/validation/ur_valddi.cpp‎
Lines changed: 15 additions & 2 deletions b/‎source/loader/layers/validation/ur_valddi.cpp‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎source/loader/ur_ldrddi.cpp‎
Lines changed: 3 additions & 2 deletions b/‎source/loader/ur_ldrddi.cpp‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎source/loader/ur_libapi.cpp‎
Lines changed: 7 additions & 2 deletions b/‎source/loader/ur_libapi.cpp‎
Lines changed: 7 additions & 2 deletions
@@ -8391,6 +8391,9 @@ urCommandBufferFinalizeExp(
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///         + `phKernelAlternatives == NULL && numKernelAlternatives > 0`
+///         + `phKernelAlternatives != NULL && numKernelAlternatives == 0`
+///         + If `phKernelAlternatives` contains `hKernel`
 ///     - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP
 ///     - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP
 ///         + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`
@@ -8406,10 +8409,11 @@ urCommandBufferAppendKernelLaunchExp(
     const size_t *pGlobalWorkSize,                                ///< [in] Global work size to use when executing kernel.
     const size_t *pLocalWorkSize,                                 ///< [in][optional] Local work size to use when executing kernel.
     uint32_t numKernelAlternatives,                               ///< [in] The number of kernel alternatives provided in
-                                                                  ///< pKernelAlternatives.
+                                                                  ///< phKernelAlternatives.
     ur_kernel_handle_t *phKernelAlternatives,                     ///< [in][optional][range(0, numKernelAlternatives)] List of kernels
                                                                   ///< handles that might be used to update the kernel in this
-                                                                  ///< command after the command-buffer is finalized.
+                                                                  ///< command after the command-buffer is finalized. It's invalid to specify
+                                                                  ///< the default kernel `hKernel` as part of this list.
     uint32_t numSyncPointsInWaitList,                             ///< [in] The number of sync points in the provided dependency list.
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May
                                                                   ///< be ignored if command-buffer is in-order.
@@ -8937,6 +8941,7 @@ urCommandBufferReleaseCommandExp(
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///         + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created.
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 UR_APIEXPORT ur_result_t UR_APICALL
 
@@ -319,7 +319,7 @@ params:
       name: "phKernelAlternatives"
       desc: |
             [in][optional][range(0, numKernelAlternatives)] List of kernels handles that might be used to update the kernel in this 
-            command after the command-buffer is finalized.
+            command after the command-buffer is finalized. It's invalid to specify the default kernel `hKernel` as part of this list.
     - type: uint32_t
       name: numSyncPointsInWaitList
       desc: "[in] The number of sync points in the provided dependency list."
@@ -338,7 +338,10 @@ returns:
     - $X_RESULT_ERROR_INVALID_KERNEL
     - $X_RESULT_ERROR_INVALID_WORK_DIMENSION
     - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
-    - $X_RESULT_ERROR_INVALID_VALUE
+    - $X_RESULT_ERROR_INVALID_VALUE:
+        - "`phKernelAlternatives == NULL && numKernelAlternatives > 0`"
+        - "`phKernelAlternatives != NULL && numKernelAlternatives == 0`"
+        - "If `phKernelAlternatives` contains `hKernel`"
     - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP
     - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP:
         - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`"
@@ -939,7 +942,8 @@ returns:
     - $X_RESULT_ERROR_INVALID_ENUMERATION
     - $X_RESULT_ERROR_INVALID_WORK_DIMENSION
     - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
-    - $X_RESULT_ERROR_INVALID_VALUE
+    - $X_RESULT_ERROR_INVALID_VALUE:
+        - "If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of $xCommandBufferAppendKernelLaunchExp when this command was created."
     - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
     - $X_RESULT_ERROR_OUT_OF_RESOURCES
 --- #--------------------------------------------------------------------------
 
@@ -76,9 +76,11 @@ ur_exp_command_buffer_command_handle_t_::
         ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
         CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim,
         const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr,
-        const size_t *LocalWorkSizePtr)
-    : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params),
-      WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) {
+        const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives,
+        ur_kernel_handle_t *KernelAlternatives)
+    : CommandBuffer(CommandBuffer), Kernel(Kernel), ValidKernelHandles(),
+      Node(Node), Params(Params), WorkDim(WorkDim), RefCountInternal(1),
+      RefCountExternal(1) {
   CommandBuffer->incrementInternalReferenceCount();
 
   const size_t CopySize = sizeof(size_t) * WorkDim;
@@ -96,6 +98,13 @@ ur_exp_command_buffer_command_handle_t_::
     std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize);
     std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize);
   }
+
+  /* Add the default Kernel as a valid kernel handle for this command */
+  ValidKernelHandles.insert(Kernel);
+  if (KernelAlternatives) {
+    ValidKernelHandles.insert(KernelAlternatives,
+                              KernelAlternatives + NumKernelAlternatives);
+  }
 }
 
 /// Helper function for finding the Cuda Nodes associated with the
@@ -344,8 +353,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel,
     uint32_t workDim, const size_t *pGlobalWorkOffset,
     const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t /*numKernelAlternatives*/,
-    ur_kernel_handle_t * /*phKernelAlternatives*/,
+    uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives,
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint,
@@ -356,6 +364,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
   UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
   UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 
+  for (uint32_t i = 0; i < numKernelAlternatives; ++i) {
+    UR_ASSERT(phKernelAlternatives[i] != hKernel,
+              UR_RESULT_ERROR_INVALID_VALUE);
+  }
+
   CUgraphNode GraphNode;
 
   std::vector<CUgraphNode> DepsList;
@@ -420,8 +433,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     }
 
     auto NewCommand = new ur_exp_command_buffer_command_handle_t_{
-        hCommandBuffer, hKernel,           GraphNode,       NodeParams,
-        workDim,        pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize};
+        hCommandBuffer,      hKernel,        GraphNode,
+        NodeParams,          workDim,        pGlobalWorkOffset,
+        pGlobalWorkSize,     pLocalWorkSize, numKernelAlternatives,
+        phKernelAlternatives};
 
     NewCommand->incrementInternalReferenceCount();
     hCommandBuffer->CommandHandles.push_back(NewCommand);
@@ -865,10 +880,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
   }
 
   if (auto NewWorkDim = pUpdateKernelLaunch->newWorkDim) {
-    // Error if work dim changes
-    if (NewWorkDim != hCommand->WorkDim) {
-      return UR_RESULT_ERROR_INVALID_OPERATION;
-    }
 
     // Error If Local size and not global size
     if ((pUpdateKernelLaunch->pNewLocalWorkSize != nullptr) &&
@@ -888,7 +899,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
   }
 
   // Kernel corresponding to the command to update
-  ur_kernel_handle_t Kernel = hCommand->Kernel;
+  ur_kernel_handle_t NewKernel = pUpdateKernelLaunch->hNewKernel;
+
+  if (hCommand->ValidKernelHandles.count(NewKernel)) {
+    hCommand->Kernel = NewKernel;
+  } else {
+    return UR_RESULT_ERROR_INVALID_VALUE;
+  }
 
   // Update pointer arguments to the kernel
   uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs;
@@ -901,7 +918,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
 
     ur_result_t Result = UR_RESULT_SUCCESS;
     try {
-      Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue);
+      NewKernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue);
     } catch (ur_result_t Err) {
       Result = Err;
       return Result;
@@ -920,11 +937,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
     ur_result_t Result = UR_RESULT_SUCCESS;
     try {
       if (ArgValue == nullptr) {
-        Kernel->setKernelArg(ArgIndex, 0, nullptr);
+        NewKernel->setKernelArg(ArgIndex, 0, nullptr);
       } else {
         CUdeviceptr CuPtr =
             std::get<BufferMem>(ArgValue->Mem).getPtr(CommandBuffer->Device);
-        Kernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr);
+        NewKernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr);
       }
     } catch (ur_result_t Err) {
       Result = Err;
@@ -945,7 +962,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
     ur_result_t Result = UR_RESULT_SUCCESS;
 
     try {
-      Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue);
+      NewKernel->setKernelArg(ArgIndex, ArgSize, ArgValue);
     } catch (ur_result_t Err) {
       Result = Err;
       return Result;
@@ -985,12 +1002,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
   // by default unless user has provided a better number
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
-  CUfunction CuFunc = Kernel->get();
+  CUfunction CuFunc = NewKernel->get();
   ur_context_handle_t Context = CommandBuffer->Context;
   ur_device_handle_t Device = CommandBuffer->Device;
   auto Result = setKernelParams(Context, Device, WorkDim, GlobalWorkOffset,
-                                GlobalWorkSize, LocalWorkSize, Kernel, CuFunc,
-                                ThreadsPerBlock, BlocksPerGrid);
+                                GlobalWorkSize, LocalWorkSize, NewKernel,
+                                CuFunc, ThreadsPerBlock, BlocksPerGrid);
   if (Result != UR_RESULT_SUCCESS) {
     return Result;
   }
@@ -1004,8 +1021,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
   Params.blockDimX = ThreadsPerBlock[0];
   Params.blockDimY = ThreadsPerBlock[1];
   Params.blockDimZ = ThreadsPerBlock[2];
-  Params.sharedMemBytes = Kernel->getLocalSize();
-  Params.kernelParams = const_cast<void **>(Kernel->getArgIndices().data());
+  Params.sharedMemBytes = NewKernel->getLocalSize();
+  Params.kernelParams = const_cast<void **>(NewKernel->getArgIndices().data());
 
   CUgraphNode Node = hCommand->Node;
   CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec;
 
@@ -16,6 +16,7 @@
 #include "logger/ur_logger.hpp"
 #include <cuda.h>
 #include <memory>
+#include <unordered_set>
 
 // Trace an internal UR call
 #define UR_TRACE(Call)                                                         \
@@ -44,7 +45,8 @@ struct ur_exp_command_buffer_command_handle_t_ {
       ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
       CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim,
       const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr,
-      const size_t *LocalWorkSizePtr);
+      const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives,
+      ur_kernel_handle_t *KernelAlternatives);
 
   void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) {
     const size_t CopySize = sizeof(size_t) * WorkDim;
@@ -96,7 +98,14 @@ struct ur_exp_command_buffer_command_handle_t_ {
   }
 
   ur_exp_command_buffer_handle_t CommandBuffer;
+
+  /* The currently active kernel handle for this command */
   ur_kernel_handle_t Kernel;
+
+  /* Set of all the kernel handles that can be used when updating this command
+   */
+  std::unordered_set<ur_kernel_handle_t> ValidKernelHandles;
+
   CUgraphNode Node;
   CUDA_KERNEL_NODE_PARAMS Params;
 
 
@@ -8352,11 +8352,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
         pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel.
     uint32_t
         numKernelAlternatives, ///< [in] The number of kernel alternatives provided in
-                               ///< pKernelAlternatives.
+                               ///< phKernelAlternatives.
     ur_kernel_handle_t *
         phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernels
     ///< handles that might be used to update the kernel in this
-    ///< command after the command-buffer is finalized.
+    ///< command after the command-buffer is finalized. It's invalid to specify
+    ///< the default kernel `hKernel` as part of this list.
     uint32_t
         numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list.
     const ur_exp_command_buffer_sync_point_t *
 
@@ -6496,11 +6496,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
         pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel.
     uint32_t
         numKernelAlternatives, ///< [in] The number of kernel alternatives provided in
-                               ///< pKernelAlternatives.
+                               ///< phKernelAlternatives.
     ur_kernel_handle_t *
         phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernels
     ///< handles that might be used to update the kernel in this
-    ///< command after the command-buffer is finalized.
+    ///< command after the command-buffer is finalized. It's invalid to specify
+    ///< the default kernel `hKernel` as part of this list.
     uint32_t
         numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list.
     const ur_exp_command_buffer_sync_point_t *
 
@@ -8057,11 +8057,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
         pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel.
     uint32_t
         numKernelAlternatives, ///< [in] The number of kernel alternatives provided in
-                               ///< pKernelAlternatives.
+                               ///< phKernelAlternatives.
     ur_kernel_handle_t *
         phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernels
     ///< handles that might be used to update the kernel in this
-    ///< command after the command-buffer is finalized.
+    ///< command after the command-buffer is finalized. It's invalid to specify
+    ///< the default kernel `hKernel` as part of this list.
     uint32_t
         numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list.
     const ur_exp_command_buffer_sync_point_t *
@@ -8096,6 +8097,18 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
+        if (phKernelAlternatives == NULL && numKernelAlternatives > 0) {
+            return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+
+        if (phKernelAlternatives != NULL && numKernelAlternatives == 0) {
+            return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+
+        if (phKernelAlternatives` contains `hKernel) {
+            return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+
         if (pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0) {
             return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP;
         }
 
@@ -7108,11 +7108,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
         pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel.
     uint32_t
         numKernelAlternatives, ///< [in] The number of kernel alternatives provided in
-                               ///< pKernelAlternatives.
+                               ///< phKernelAlternatives.
     ur_kernel_handle_t *
         phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernels
     ///< handles that might be used to update the kernel in this
-    ///< command after the command-buffer is finalized.
+    ///< command after the command-buffer is finalized. It's invalid to specify
+    ///< the default kernel `hKernel` as part of this list.
     uint32_t
         numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list.
     const ur_exp_command_buffer_sync_point_t *
 
@@ -7525,6 +7525,9 @@ ur_result_t UR_APICALL urCommandBufferFinalizeExp(
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///         + `phKernelAlternatives == NULL && numKernelAlternatives > 0`
+///         + `phKernelAlternatives != NULL && numKernelAlternatives == 0`
+///         + If `phKernelAlternatives` contains `hKernel`
 ///     - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP
 ///     - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP
 ///         + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`
@@ -7544,11 +7547,12 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
         pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel.
     uint32_t
         numKernelAlternatives, ///< [in] The number of kernel alternatives provided in
-                               ///< pKernelAlternatives.
+                               ///< phKernelAlternatives.
     ur_kernel_handle_t *
         phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernels
     ///< handles that might be used to update the kernel in this
-    ///< command after the command-buffer is finalized.
+    ///< command after the command-buffer is finalized. It's invalid to specify
+    ///< the default kernel `hKernel` as part of this list.
     uint32_t
         numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list.
     const ur_exp_command_buffer_sync_point_t *
@@ -8321,6 +8325,7 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
 ///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///         + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created.
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(