diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 2b18a4fb6e28f..eac772e4ce378 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -467,13 +467,14 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
       BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
       assert(BinImage && "Failed to obtain a binary image.");
     }
-    enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
-                     KernelBundleImpPtr, KernelImplPtr,
-                     *KData.getDeviceKernelInfoPtr(), RawEvents,
-                     ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
-                     KData.isCooperative(), KData.usesClusterLaunch(),
-                     KData.getKernelWorkGroupMemorySize(), BinImage,
-                     KData.getKernelFuncPtr());
+    enqueueImpKernel(
+        *this, KData.getNDRDesc(), KData.getArgs(), KernelBundleImpPtr,
+        KernelImplPtr, *KData.getDeviceKernelInfoPtr(), RawEvents,
+        ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
+        KData.isCooperative(), KData.usesClusterLaunch(),
+        KData.getKernelWorkGroupMemorySize(), BinImage,
+        KData.getKernelFuncPtr(), KData.getKernelNumArgs(),
+        KData.getKernelParamDescGetter(), KData.hasSpecialCaptures());
 #ifdef XPTI_ENABLE_INSTRUMENTATION
     if (xptiEnabled) {
       // Emit signal only when event is created
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index b38706b9d9d37..d451c1e035576 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -2303,14 +2303,14 @@ ur_mem_flags_t AccessModeToUr(access::mode AccessorMode) {
   }
 }
 
-// Sets arguments for a given kernel and device based on the argument type.
-// Refactored from SetKernelParamsAndLaunch to allow it to be used in the graphs
-// extension.
-static void SetArgBasedOnType(
-    adapter_impl &Adapter, ur_kernel_handle_t Kernel,
+// Gets UR argument struct for a given kernel and device based on the argument
+// type. Refactored from SetKernelParamsAndLaunch to allow it to be used in
+// the graphs extension (LaunchWithArgs for graphs is planned future work).
+static void GetUrArgsBasedOnType(
     device_image_impl *DeviceImageImpl,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
-    context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) {
+    context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex,
+    std::vector<ur_exp_kernel_arg_properties_t> &UrArgs) {
   switch (Arg.MType) {
   case kernel_param_kind_t::kind_dynamic_work_group_memory:
     break;
@@ -2330,52 +2330,61 @@ static void SetArgBasedOnType(
         getMemAllocationFunc
             ? reinterpret_cast<ur_mem_handle_t>(getMemAllocationFunc(Req))
             : nullptr;
-    ur_kernel_arg_mem_obj_properties_t MemObjData{};
-    MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
-    MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode);
-    Adapter.call<UrApiKind::urKernelSetArgMemObj>(Kernel, NextTrueIndex,
-                                                  &MemObjData, MemArg);
+    ur_exp_kernel_arg_value_t Value = {};
+    Value.memObjTuple = {MemArg, AccessModeToUr(Req->MAccessMode)};
+    UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                      UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ,
+                      static_cast<uint32_t>(NextTrueIndex), sizeof(MemArg),
+                      Value});
     break;
   }
   case kernel_param_kind_t::kind_std_layout: {
+    ur_exp_kernel_arg_type_t Type;
     if (Arg.MPtr) {
-      Adapter.call<UrApiKind::urKernelSetArgValue>(
-          Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr);
+      Type = UR_EXP_KERNEL_ARG_TYPE_VALUE;
     } else {
-      Adapter.call<UrApiKind::urKernelSetArgLocal>(Kernel, NextTrueIndex,
-                                                   Arg.MSize, nullptr);
+      Type = UR_EXP_KERNEL_ARG_TYPE_LOCAL;
     }
+    ur_exp_kernel_arg_value_t Value = {};
+    Value.value = {Arg.MPtr};
+    UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                      Type, static_cast<uint32_t>(NextTrueIndex),
+                      static_cast<size_t>(Arg.MSize), Value});
 
     break;
   }
   case kernel_param_kind_t::kind_sampler: {
     sampler *SamplerPtr = (sampler *)Arg.MPtr;
-    ur_sampler_handle_t Sampler =
-        (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
-            ->getOrCreateSampler(ContextImpl);
-    Adapter.call<UrApiKind::urKernelSetArgSampler>(Kernel, NextTrueIndex,
-                                                   nullptr, Sampler);
+    ur_exp_kernel_arg_value_t Value = {};
+    Value.sampler = (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
+                        ->getOrCreateSampler(ContextImpl);
+    UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                      UR_EXP_KERNEL_ARG_TYPE_SAMPLER,
+                      static_cast<uint32_t>(NextTrueIndex),
+                      sizeof(ur_sampler_handle_t), Value});
     break;
   }
   case kernel_param_kind_t::kind_pointer: {
-    // We need to de-rerence this to get the actual USM allocation - that's the
+    ur_exp_kernel_arg_value_t Value = {};
+    // We need to de-rerence to get the actual USM allocation - that's the
     // pointer UR is expecting.
-    const void *Ptr = *static_cast<const void *const *>(Arg.MPtr);
-    Adapter.call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
-                                                   nullptr, Ptr);
+    Value.pointer = *static_cast<void *const *>(Arg.MPtr);
+    UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                      UR_EXP_KERNEL_ARG_TYPE_POINTER,
+                      static_cast<uint32_t>(NextTrueIndex), sizeof(Arg.MPtr),
+                      Value});
     break;
   }
   case kernel_param_kind_t::kind_specialization_constants_buffer: {
     assert(DeviceImageImpl != nullptr);
     ur_mem_handle_t SpecConstsBuffer =
         DeviceImageImpl->get_spec_const_buffer_ref();
-
-    ur_kernel_arg_mem_obj_properties_t MemObjProps{};
-    MemObjProps.pNext = nullptr;
-    MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
-    MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
-    Adapter.call<UrApiKind::urKernelSetArgMemObj>(
-        Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer);
+    ur_exp_kernel_arg_value_t Value = {};
+    Value.memObjTuple = {SpecConstsBuffer, UR_MEM_FLAG_READ_ONLY};
+    UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                      UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ,
+                      static_cast<uint32_t>(NextTrueIndex),
+                      sizeof(SpecConstsBuffer), Value});
     break;
   }
   case kernel_param_kind_t::kind_invalid:
@@ -2394,58 +2403,75 @@ static ur_result_t SetKernelParamsAndLaunch(
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     bool IsCooperative, bool KernelUsesClusterLaunch,
     uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage,
-    DeviceKernelInfo &DeviceKernelInfo, void *KernelFuncPtr = nullptr) {
+    KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr,
+    int KernelNumArgs = 0,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr,
+    bool KernelHasSpecialCaptures = true) {
   adapter_impl &Adapter = Queue.getAdapter();
 
   if (SYCLConfig<SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
     std::vector<unsigned char> Empty;
     Kernel = Scheduler::getInstance().completeSpecConstMaterialization(
-        Queue, BinImage, DeviceKernelInfo.Name,
+        Queue, BinImage, KernelName,
         DeviceImageImpl ? DeviceImageImpl->get_spec_const_blob_ref() : Empty);
   }
 
-  if (KernelFuncPtr && !DeviceKernelInfo.HasSpecialCaptures) {
-    auto setFunc = [&Adapter, Kernel,
+  std::vector<ur_exp_kernel_arg_properties_t> UrArgs;
+  UrArgs.reserve(Args.size());
+
+  if (KernelFuncPtr && !KernelHasSpecialCaptures) {
+    auto setFunc = [&UrArgs,
                     KernelFuncPtr](const detail::kernel_param_desc_t &ParamDesc,
                                    size_t NextTrueIndex) {
       const void *ArgPtr = (const char *)KernelFuncPtr + ParamDesc.offset;
       switch (ParamDesc.kind) {
       case kernel_param_kind_t::kind_std_layout: {
         int Size = ParamDesc.info;
-        Adapter.call<UrApiKind::urKernelSetArgValue>(Kernel, NextTrueIndex,
-                                                     Size, nullptr, ArgPtr);
+        ur_exp_kernel_arg_value_t Value = {};
+        Value.value = ArgPtr;
+        UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                          UR_EXP_KERNEL_ARG_TYPE_VALUE,
+                          static_cast<uint32_t>(NextTrueIndex),
+                          static_cast<size_t>(Size), Value});
         break;
       }
       case kernel_param_kind_t::kind_pointer: {
-        const void *Ptr = *static_cast<const void *const *>(ArgPtr);
-        Adapter.call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
-                                                       nullptr, Ptr);
+        ur_exp_kernel_arg_value_t Value = {};
+        Value.pointer = *static_cast<const void *const *>(ArgPtr);
+        UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                          UR_EXP_KERNEL_ARG_TYPE_POINTER,
+                          static_cast<uint32_t>(NextTrueIndex),
+                          sizeof(Value.pointer), Value});
         break;
       }
       default:
         throw std::runtime_error("Direct kernel argument copy failed.");
       }
     };
-    applyFuncOnFilteredArgs(EliminatedArgMask, DeviceKernelInfo.NumParams,
-                            DeviceKernelInfo.ParamDescGetter, setFunc);
+    applyFuncOnFilteredArgs(EliminatedArgMask, KernelNumArgs,
+                            KernelParamDescGetter, setFunc);
   } else {
-    auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
-                    &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
-      SetArgBasedOnType(Adapter, Kernel, DeviceImageImpl, getMemAllocationFunc,
-                        Queue.getContextImpl(), Arg, NextTrueIndex);
+    auto setFunc = [&DeviceImageImpl, &getMemAllocationFunc, &Queue,
+                    &UrArgs](detail::ArgDesc &Arg, size_t NextTrueIndex) {
+      GetUrArgsBasedOnType(DeviceImageImpl, getMemAllocationFunc,
+                           Queue.getContextImpl(), Arg, NextTrueIndex, UrArgs);
     };
     applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
   }
 
-  const std::optional<int> &ImplicitLocalArg =
-      DeviceKernelInfo.getImplicitLocalArgPos();
+  std::optional<int> ImplicitLocalArg =
+      ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName);
   // Set the implicit local memory buffer to support
   // get_work_group_scratch_memory. This is for backend not supporting
   // CUDA-style local memory setting. Note that we may have -1 as a position,
   // this indicates the buffer is actually unused and was elided.
   if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) {
-    Adapter.call<UrApiKind::urKernelSetArgLocal>(
-        Kernel, ImplicitLocalArg.value(), WorkGroupMemorySize, nullptr);
+    UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
+                      nullptr,
+                      UR_EXP_KERNEL_ARG_TYPE_LOCAL,
+                      static_cast<uint32_t>(ImplicitLocalArg.value()),
+                      WorkGroupMemorySize,
+                      {nullptr}});
   }
 
   adjustNDRangePerKernel(NDRDesc, Kernel, Queue.getDeviceImpl());
@@ -2468,16 +2494,14 @@ static ur_result_t SetKernelParamsAndLaunch(
         /* pPropSizeRet = */ nullptr);
 
     const bool EnforcedLocalSize =
-        (RequiredWGSize[0] != 0 &&
-         (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
-         (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
+        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
+         RequiredWGSize[2] != 0);
     if (EnforcedLocalSize)
       LocalSize = RequiredWGSize;
   }
-
-  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 &&
-                         (NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) &&
-                         (NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0);
+  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
+                         NDRDesc.GlobalOffset[1] != 0 ||
+                         NDRDesc.GlobalOffset[2] != 0;
 
   std::vector<ur_kernel_launch_property_t> property_list;
 
@@ -2505,13 +2529,15 @@ static ur_result_t SetKernelParamsAndLaunch(
                              {{WorkGroupMemorySize}}});
   }
   ur_event_handle_t UREvent = nullptr;
-  ur_result_t Error = Adapter.call_nocheck<UrApiKind::urEnqueueKernelLaunch>(
-      Queue.getHandleRef(), Kernel, NDRDesc.Dims,
-      HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0],
-      LocalSize, property_list.size(),
-      property_list.empty() ? nullptr : property_list.data(), RawEvents.size(),
-      RawEvents.empty() ? nullptr : &RawEvents[0],
-      OutEventImpl ? &UREvent : nullptr);
+  ur_result_t Error =
+      Adapter.call_nocheck<UrApiKind::urEnqueueKernelLaunchWithArgsExp>(
+          Queue.getHandleRef(), Kernel, NDRDesc.Dims,
+          HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
+          &NDRDesc.GlobalSize[0], LocalSize, UrArgs.size(), UrArgs.data(),
+          property_list.size(),
+          property_list.empty() ? nullptr : property_list.data(),
+          RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
+          OutEventImpl ? &UREvent : nullptr);
   if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
     OutEventImpl->setHandle(UREvent);
   }
@@ -2519,6 +2545,88 @@ static ur_result_t SetKernelParamsAndLaunch(
   return Error;
 }
 
+// Sets arguments for a given kernel and device based on the argument type.
+// This is a legacy path which the graphs extension still uses.
+static void SetArgBasedOnType(
+    adapter_impl &Adapter, ur_kernel_handle_t Kernel,
+    device_image_impl *DeviceImageImpl,
+    const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
+    context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) {
+  switch (Arg.MType) {
+  case kernel_param_kind_t::kind_dynamic_work_group_memory:
+    break;
+  case kernel_param_kind_t::kind_work_group_memory:
+    break;
+  case kernel_param_kind_t::kind_stream:
+    break;
+  case kernel_param_kind_t::kind_dynamic_accessor:
+  case kernel_param_kind_t::kind_accessor: {
+    Requirement *Req = (Requirement *)(Arg.MPtr);
+
+    // getMemAllocationFunc is nullptr when there are no requirements. However,
+    // we may pass default constructed accessors to a command, which don't add
+    // requirements. In such case, getMemAllocationFunc is nullptr, but it's a
+    // valid case, so we need to properly handle it.
+    ur_mem_handle_t MemArg =
+        getMemAllocationFunc
+            ? reinterpret_cast<ur_mem_handle_t>(getMemAllocationFunc(Req))
+            : nullptr;
+    ur_kernel_arg_mem_obj_properties_t MemObjData{};
+    MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
+    MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode);
+    Adapter.call<UrApiKind::urKernelSetArgMemObj>(Kernel, NextTrueIndex,
+                                                  &MemObjData, MemArg);
+    break;
+  }
+  case kernel_param_kind_t::kind_std_layout: {
+    if (Arg.MPtr) {
+      Adapter.call<UrApiKind::urKernelSetArgValue>(
+          Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr);
+    } else {
+      Adapter.call<UrApiKind::urKernelSetArgLocal>(Kernel, NextTrueIndex,
+                                                   Arg.MSize, nullptr);
+    }
+
+    break;
+  }
+  case kernel_param_kind_t::kind_sampler: {
+    sampler *SamplerPtr = (sampler *)Arg.MPtr;
+    ur_sampler_handle_t Sampler =
+        (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
+            ->getOrCreateSampler(ContextImpl);
+    Adapter.call<UrApiKind::urKernelSetArgSampler>(Kernel, NextTrueIndex,
+                                                   nullptr, Sampler);
+    break;
+  }
+  case kernel_param_kind_t::kind_pointer: {
+    // We need to de-rerence this to get the actual USM allocation - that's the
+    // pointer UR is expecting.
+    const void *Ptr = *static_cast<const void *const *>(Arg.MPtr);
+    Adapter.call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
+                                                   nullptr, Ptr);
+    break;
+  }
+  case kernel_param_kind_t::kind_specialization_constants_buffer: {
+    assert(DeviceImageImpl != nullptr);
+    ur_mem_handle_t SpecConstsBuffer =
+        DeviceImageImpl->get_spec_const_buffer_ref();
+
+    ur_kernel_arg_mem_obj_properties_t MemObjProps{};
+    MemObjProps.pNext = nullptr;
+    MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
+    MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
+    Adapter.call<UrApiKind::urKernelSetArgMemObj>(
+        Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer);
+    break;
+  }
+  case kernel_param_kind_t::kind_invalid:
+    throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+                          "Invalid kernel param kind " +
+                              codeToString(UR_RESULT_ERROR_INVALID_VALUE));
+    break;
+  }
+}
+
 static std::tuple<ur_kernel_handle_t, device_image_impl *,
                   const KernelArgMask *>
 getCGKernelInfo(const CGExecKernel &CommandGroup, context_impl &ContextImpl,
@@ -2668,7 +2776,9 @@ void enqueueImpKernel(
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative,
     const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize,
-    const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr) {
+    const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr, int KernelNumArgs,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
+    bool KernelHasSpecialCaptures) {
   // Run OpenCL kernel
   context_impl &ContextImpl = Queue.getContextImpl();
   device_impl &DeviceImpl = Queue.getDeviceImpl();
@@ -2755,7 +2865,8 @@ void enqueueImpKernel(
         Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
         OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
         KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize,
-        BinImage, DeviceKernelInfo, KernelFuncPtr);
+        BinImage, DeviceKernelInfo.Name, KernelFuncPtr, KernelNumArgs,
+        KernelParamDescGetter, KernelHasSpecialCaptures);
   }
   if (UR_RESULT_SUCCESS != Error) {
     // If we have got non-success error code, let's analyze it to emit nice
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index d47a5d9d9131f..7c9600eb4a5c5 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -633,7 +633,9 @@ void enqueueImpKernel(
     ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative,
     const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize,
     const RTDeviceBinaryImage *BinImage = nullptr,
-    void *KernelFuncPtr = nullptr);
+    void *KernelFuncPtr = nullptr, int KernelNumArgs = 0,
+    detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr,
+    bool KernelHasSpecialCaptures = true);
 
 /// The exec CG command enqueues execution of kernel or explicit memory
 /// operation.
diff --git a/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp b/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp
index 9b4a05391e3ca..6ede03dca7a84 100644
--- a/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp
+++ b/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp
@@ -24,7 +24,7 @@ int main(int argc, char *argv[]) {
   queue q;
 
   submit_kernel(q); // starts a batch
-                    // CHECK: ---> urEnqueueKernelLaunch
+                    // CHECK: ---> urEnqueueKernelLaunchWithArgsExp
                     // CHECK-NOT: zeCommandQueueExecuteCommandLists
 
   // Initializing Level Zero driver is required if this test is linked
@@ -42,7 +42,7 @@ int main(int argc, char *argv[]) {
   // CHECK-NOT: zeCommandQueueExecuteCommandLists
 
   submit_kernel(q);
-  // CHECK: ---> urEnqueueKernelLaunch
+  // CHECK: ---> urEnqueueKernelLaunchWithArgsExp
   // CHECK-NOT: zeCommandQueueExecuteCommandLists
 
   // interop should close the batch
diff --git a/sycl/test-e2e/Adapters/level_zero/batch_test.cpp b/sycl/test-e2e/Adapters/level_zero/batch_test.cpp
index 8f6e4e0f6a563..1db60a36e1265 100644
--- a/sycl/test-e2e/Adapters/level_zero/batch_test.cpp
+++ b/sycl/test-e2e/Adapters/level_zero/batch_test.cpp
@@ -55,55 +55,54 @@
 // variable SYCL_PI_LEVEL_ZEOR+BATCH_SIZE=N.
 // This test enqueues 8 kernels and then does a wait. And it does this 3 times.
 // Expected output is that for batching =1 you will see zeCommandListClose,
-// and zeCommandQueueExecuteCommandLists after every urEnqueueKernelLaunch.
-// For batching=3 you will see that after 3rd and 6th enqueues, and then after
-// urQueueFinish. For 5, after 5th urEnqueue, and then after urQueueFinish.  For
-// 4 you will see these after 4th and 8th Enqueue, and for 8, only after the
-// 8th enqueue.  And lastly for 9, you will see the Close and Execute calls
-// only after the urQueueFinish.
-// Since the test does this 3 times, this pattern will repeat 2 more times,
-// and then the test will print Test Passed 8 times, once for each kernel
-// validation check.
+// and zeCommandQueueExecuteCommandLists after every
+// urEnqueueKernelLaunchWithArgsExp. For batching=3 you will see that after 3rd
+// and 6th enqueues, and then after urQueueFinish. For 5, after 5th urEnqueue,
+// and then after urQueueFinish.  For 4 you will see these after 4th and 8th
+// Enqueue, and for 8, only after the 8th enqueue.  And lastly for 9, you will
+// see the Close and Execute calls only after the urQueueFinish. Since the test
+// does this 3 times, this pattern will repeat 2 more times, and then the test
+// will print Test Passed 8 times, once for each kernel validation check.
 // Pattern starts first set of kernel executions.
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB4:  zeCommandListClose(
 // CKB4:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB5:  zeCommandListClose(
 // CKB5:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB7:  zeCommandListClose(
 // CKB7:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
@@ -121,45 +120,45 @@
 // CKB9:  zeCommandListClose(
 // CKB9:  zeCommandQueueExecuteCommandLists(
 // Pattern starts 2nd set of kernel executions
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB4:  zeCommandListClose(
 // CKB4:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB5:  zeCommandListClose(
 // CKB5:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB7:  zeCommandListClose(
 // CKB7:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
@@ -177,45 +176,45 @@
 // CKB9:  zeCommandListClose(
 // CKB9:  zeCommandQueueExecuteCommandLists(
 // Pattern starts 3rd set of kernel executions
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB4:  zeCommandListClose(
 // CKB4:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB5:  zeCommandListClose(
 // CKB5:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB7:  zeCommandListClose(
 // CKB7:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
diff --git a/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp b/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp
index 8470c7abd2bfa..85634616c67a9 100644
--- a/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp
+++ b/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp
@@ -31,55 +31,53 @@
 // variable SYCL_PI_LEVEL_ZERO_{COPY_}BATCH_SIZE=N.
 // This test enqueues 8 kernels and then does a wait. And it does this 3 times.
 // Expected output is that for batching =1 you will see zeCommandListClose,
-// and zeCommandQueueExecuteCommandLists after every urEnqueueKernelLaunch.
-// For batching=3 you will see that after 3rd and 6th enqueues, and then after
-// urEventWait. For 5, after 5th urEnqueue, and then after urEventWait.  For
-// 4 you will see these after 4th and 8th Enqueue, and for 8, only after the
-// 8th enqueue.  And lastly for 9, you will see the Close and Execute calls
-// only after the urEventWait.
-// Since the test does this 3 times, this pattern will repeat 2 more times,
-// and then the test will print Test Passed 8 times, once for each kernel
-// validation check.
-// Pattern starts first set of kernel executions.
-// CKALL: ---> urEnqueueKernelLaunch
+// and zeCommandQueueExecuteCommandLists after every
+// urEnqueueKernelLaunchWithArgsExp. For batching=3 you will see that after 3rd
+// and 6th enqueues, and then after urEventWait. For 5, after 5th urEnqueue, and
+// then after urEventWait.  For 4 you will see these after 4th and 8th Enqueue,
+// and for 8, only after the 8th enqueue.  And lastly for 9, you will see the
+// Close and Execute calls only after the urEventWait. Since the test does this
+// 3 times, this pattern will repeat 2 more times, and then the test will print
+// Test Passed 8 times, once for each kernel validation check. Pattern starts
+// first set of kernel executions. CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB4:  zeCommandListClose(
 // CKB4:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB5:  zeCommandListClose(
 // CKB5:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB7:  zeCommandListClose(
 // CKB7:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
@@ -97,45 +95,45 @@
 // CKB9:  zeCommandListClose(
 // CKB9:  zeCommandQueueExecuteCommandLists(
 // Pattern starts 2nd set of kernel executions
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB4:  zeCommandListClose(
 // CKB4:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB5:  zeCommandListClose(
 // CKB5:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB7:  zeCommandListClose(
 // CKB7:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
@@ -153,45 +151,45 @@
 // CKB9:  zeCommandListClose(
 // CKB9:  zeCommandQueueExecuteCommandLists(
 // Pattern starts 3rd set of kernel executions
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB4:  zeCommandListClose(
 // CKB4:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB5:  zeCommandListClose(
 // CKB5:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB3:  zeCommandListClose(
 // CKB3:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
 // CKB7:  zeCommandListClose(
 // CKB7:  zeCommandQueueExecuteCommandLists(
-// CKALL: ---> urEnqueueKernelLaunch
+// CKALL: ---> urEnqueueKernelLaunchWithArgsExp
 // CKALL: zeCommandListAppendLaunchKernel(
 // CKB1:  zeCommandListClose(
 // CKB1:  zeCommandQueueExecuteCommandLists(
diff --git a/sycl/test-e2e/Basic/fill_accessor_ur.cpp b/sycl/test-e2e/Basic/fill_accessor_ur.cpp
index 29a991ae40ad3..69395f8b4053b 100644
--- a/sycl/test-e2e/Basic/fill_accessor_ur.cpp
+++ b/sycl/test-e2e/Basic/fill_accessor_ur.cpp
@@ -63,7 +63,7 @@ void testFill_Buffer2D() {
       auto acc2D =
           buffer_2D.get_access<sycl::access::mode::write>(cgh, {8, 12}, {2, 2});
       // "ranged accessor" will have to be handled by custom kernel:
-      // urEnqueueKernelLaunch
+      // urEnqueueKernelLaunchWithArgsExp
       cgh.fill(acc2D, float{4});
     });
     q.wait();
@@ -94,7 +94,7 @@ void testFill_Buffer3D() {
       auto acc3D = buffer_3D.get_access<sycl::access::mode::write>(
           cgh, {4, 8, 12}, {3, 3, 3});
       // "ranged accessor" will have to be handled by custom kernel:
-      // urEnqueueKernelLaunch
+      // urEnqueueKernelLaunchWithArgsExp
       cgh.fill(acc3D, float{6});
     });
     q.wait();
@@ -139,12 +139,12 @@ int main() {
 // CHECK: start testFill_Buffer2D
 // CHECK: <--- urEnqueueMemBufferFill
 // CHECK: start testFill_Buffer2D -- OFFSET
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 
 // CHECK: start testFill_Buffer3D
 // CHECK: <--- urEnqueueMemBufferFill
 // CHECK: start testFill_Buffer3D -- OFFSET
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 
 // CHECK: start testFill_ZeroDim
 // CHECK: <--- urEnqueueMemBufferFill
diff --git a/sycl/test-e2e/Basic/host-task-dependency.cpp b/sycl/test-e2e/Basic/host-task-dependency.cpp
index 3b015051377c9..2c29034de4af8 100644
--- a/sycl/test-e2e/Basic/host-task-dependency.cpp
+++ b/sycl/test-e2e/Basic/host-task-dependency.cpp
@@ -179,13 +179,13 @@ int main() {
 // launch of Gen kernel
 // CHECK: <--- urKernelCreate
 // CHECK: NameGen
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // prepare for host task
 // CHECK: <--- urEnqueueMemBuffer{{Map|Read}}
 // launch of Copier kernel
 // CHECK: <--- urKernelCreate
 // CHECK: Copier
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 
 // CHECK:Third buffer [  0] = 0
 // CHECK:Third buffer [  1] = 1
diff --git a/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp b/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp
index 70f29387c866c..7d87d2ed13e55 100644
--- a/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp
+++ b/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp
@@ -230,7 +230,7 @@ int main() {
     // CHECK-SAME: .hKernel = [[KERNEL_HANDLE]]
     // CHECK-SAME:-> UR_RESULT_SUCCESS;
     //
-    // CHECK:<--- urEnqueueKernelLaunch(
+    // CHECK:<--- urEnqueueKernelLaunchWithArgsExp(
     // CHECK-SAME: .hKernel = [[KERNEL_HANDLE]]
     //
     // CHECK:<--- urKernelRelease(
diff --git a/sycl/test-e2e/Basic/queue/release.cpp b/sycl/test-e2e/Basic/queue/release.cpp
index 13ee5d6ee22bf..5f4dc58c92575 100644
--- a/sycl/test-e2e/Basic/queue/release.cpp
+++ b/sycl/test-e2e/Basic/queue/release.cpp
@@ -11,7 +11,7 @@ int main() {
   return 0;
 }
 
-// CHECK: <--- urEnqueueKernelLaunch(
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp(
 // FIXME the order of these 2 varies between adapters due to a Level Zero
 // specific queue workaround.
 // CHECK-DAG: <--- urEventRelease(
diff --git a/sycl/test-e2e/Basic/subdevice_pi.cpp b/sycl/test-e2e/Basic/subdevice_pi.cpp
index 4843c86a899b3..43595063abc1e 100644
--- a/sycl/test-e2e/Basic/subdevice_pi.cpp
+++ b/sycl/test-e2e/Basic/subdevice_pi.cpp
@@ -67,7 +67,7 @@ static bool check_separate(device dev, buffer<int, 1> buf,
   // CHECK-SEPARATE: <--- urContextCreate
   // CHECK-SEPARATE: <--- urQueueCreate
   // CHECK-SEPARATE: <--- urMemBufferCreate
-  // CHECK-SEPARATE: <--- urEnqueueKernelLaunch
+  // CHECK-SEPARATE: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-SEPARATE: <--- urQueueFinish
 
   log_pi("Test sub device 1");
@@ -84,7 +84,7 @@ static bool check_separate(device dev, buffer<int, 1> buf,
   // CHECK-SEPARATE: <--- urEnqueueMemBuffer{{Map|Read}}
   // CHECK-SEPARATE: <--- urEnqueueMemBufferWrite
   //
-  // CHECK-SEPARATE: <--- urEnqueueKernelLaunch
+  // CHECK-SEPARATE: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-SEPARATE: <--- urQueueFinish
 
   return true;
@@ -119,7 +119,7 @@ static bool check_shared_context(device dev, buffer<int, 1> buf,
   // Make sure that a single buffer is created (and shared between subdevices):
   // see --implicit-check-not above.
   //
-  // CHECK-SHARED: <--- urEnqueueKernelLaunch
+  // CHECK-SHARED: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-SHARED: <--- urQueueFinish
 
   log_pi("Test sub device 1");
@@ -129,7 +129,7 @@ static bool check_shared_context(device dev, buffer<int, 1> buf,
   }
   // CHECK-SHARED: Test sub device 1
   // CHECK-SHARED: <--- urQueueCreate
-  // CHECK-SHARED: <--- urEnqueueKernelLaunch
+  // CHECK-SHARED: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-SHARED: <--- urQueueFinish
   // CHECK-SHARED: <--- urEnqueueMemBufferRead
 
@@ -168,7 +168,7 @@ static bool check_fused_context(device dev, buffer<int, 1> buf,
   // Make sure that a single buffer is created (and shared between subdevices
   // *and* the root device): see --implicit-check-not above.
   //
-  // CHECK-FUSED: <--- urEnqueueKernelLaunch
+  // CHECK-FUSED: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-FUSED: <--- urQueueFinish
 
   log_pi("Test sub device 0");
@@ -178,7 +178,7 @@ static bool check_fused_context(device dev, buffer<int, 1> buf,
   }
   // CHECK-FUSED: Test sub device 0
   // CHECK-FUSED: <--- urQueueCreate
-  // CHECK-FUSED: <--- urEnqueueKernelLaunch
+  // CHECK-FUSED: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-FUSED: <--- urQueueFinish
 
   log_pi("Test sub device 1");
@@ -188,7 +188,7 @@ static bool check_fused_context(device dev, buffer<int, 1> buf,
   }
   // CHECK-FUSED: Test sub device 1
   // CHECK-FUSED: <--- urQueueCreate
-  // CHECK-FUSED: <--- urEnqueueKernelLaunch
+  // CHECK-FUSED: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-FUSED: <--- urQueueFinish
   // CHECK-FUSED: <--- urEnqueueMemBufferRead
 
diff --git a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
index 7928e5da66bac..50a488d861874 100644
--- a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
+++ b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
@@ -1,5 +1,5 @@
 // Checks whether or not event Dependencies are honored by
-// urEnqueueKernelLaunch with cluster dimensions
+// urEnqueueKernelLaunchWithArgsExp with cluster dimensions
 // REQUIRES: target-nvidia, aspect-ext_oneapi_cuda_cluster_group
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp
index 5cdbd0f73e683..fffd5d22625f7 100644
--- a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp
+++ b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp
@@ -3,12 +3,12 @@
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
 //
 // The test checks that the last parameter is not `nullptr` for
-// urEnqueueKernelLaunch for a kernel using buffer accessor.
+// urEnqueueKernelLaunchWithArgsExp for a kernel using buffer accessor.
 //
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 //
-// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}}.phEvent = nullptr
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}}.phEvent = nullptr
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK: -> UR_RESULT_SUCCESS
 //
 // CHECK: The test passed.
diff --git a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp
index 5d965b1c27ff1..df9bb56b6d280 100644
--- a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp
+++ b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp
@@ -6,10 +6,10 @@
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
 //
 // The test checks that the last parameter is not `nullptr` for
-// urEnqueueKernelLaunch.
+// urEnqueueKernelLaunchWithArgsExp.
 //
-// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}}.phEvent = nullptr
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}}.phEvent = nullptr
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK: -> UR_RESULT_SUCCESS
 //
 // CHECK: The test passed.
diff --git a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp
index 3aa91ed17cd32..79ba3d79f5800 100644
--- a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp
+++ b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp
@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 //
-// On level_zero Q.fill uses urEnqueueKernelLaunch and not urEnqueueUSMFill
-// due to https://github.com/intel/llvm/issues/13787
+// On level_zero Q.fill uses urEnqueueKernelLaunchWithArgsExp and not
+// urEnqueueUSMFill due to https://github.com/intel/llvm/issues/13787
 //
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt --check-prefixes=CHECK%if level_zero %{,CHECK-L0%} %else %{,CHECK-OTHER%}
 //
@@ -25,7 +25,7 @@
 // CHECK: -> UR_RESULT_SUCCESS
 //
 // Level-zero backend doesn't use urEnqueueUSMFill
-// CHECK-L0: <--- urEnqueueKernelLaunch
+// CHECK-L0: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK-L0: .phEvent = {{[0-9a-f]+}}
 // CHECK-OTHER: <--- urEnqueueUSMFill({{.*}} .phEvent = {{[0-9a-f]+}}
 // CHECK: -> UR_RESULT_SUCCESS
@@ -47,12 +47,12 @@
 // CHECK: <--- urEnqueueEventsWaitWithBarrier
 // CHECK: -> UR_RESULT_SUCCESS
 //
-// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK: -> UR_RESULT_SUCCESS
 //
-// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK: -> UR_RESULT_SUCCESS
 //
 // RegularQueue
@@ -74,7 +74,7 @@
 // CHECK: -> UR_RESULT_SUCCESS
 //
 // Level-zero backend doesn't use urEnqueueUSMFill
-// CHECK-L0: <--- urEnqueueKernelLaunch
+// CHECK-L0: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK-L0: .phEvent = {{[0-9a-f]+}}
 // CHECK-OTHER: <--- urEnqueueUSMFill({{.*}} .phEvent = {{[0-9a-f]+}}
 // CHECK: -> UR_RESULT_SUCCESS
@@ -96,12 +96,12 @@
 // CHECK: <--- urEnqueueEventsWaitWithBarrier
 // CHECK: -> UR_RESULT_SUCCESS
 //
-// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK: -> UR_RESULT_SUCCESS
 //
-// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK: -> UR_RESULT_SUCCESS
 //
 // RegularQueue
diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
index 3f8482f4bc3df..7d283372e2ae6 100644
--- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
+++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp
@@ -33,7 +33,7 @@ int main() {
   // CHECK-NOT: <--- urProgramRetain
   // CHECK: <--- urKernelCreate
   // CHECK-NOT: <--- urKernelRetain
-  // CHECK: <--- urEnqueueKernelLaunch
+  // CHECK: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK: <--- urProgramRelease
   // CHECK: <--- urKernelRelease
   // CHECK: <--- urEventWait
@@ -45,7 +45,7 @@ int main() {
   // CHECK-CACHE: <--- urKernelCreate
   // CHECK-CACHE: <--- urKernelRetain
   // CHECK-CACHE-NOT: <--- urKernelCreate
-  // CHECK-CACHE: <--- urEnqueueKernelLaunch
+  // CHECK-CACHE: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-CACHE-NOT: <--- urProgramRelease
   // CHECK-CACHE: <--- urEventWait
   // iteration 1:
@@ -59,7 +59,7 @@ int main() {
   // CHECK-NOT: <--- urProgramRetain
   // CHECK: <--- urKernelCreate
   // CHECK-NOT: <--- urKernelRetain
-  // CHECK: <--- urEnqueueKernelLaunch
+  // CHECK: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK: <--- urKernelRelease
   // CHECK: <--- urProgramRelease
   // CHECK: <--- urEventWait
@@ -70,7 +70,7 @@ int main() {
   // CHECK-CACHE: <--- urKernelCreate
   // CHECK-CACHE: <--- urKernelRetain
   // CHECK-CACHE-NOT: <--- urKernelCreate
-  // CHECK-CACHE: <--- urEnqueueKernelLaunch
+  // CHECK-CACHE: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-CACHE: <--- urKernelRelease
   // CHECK-CACHE: <--- urProgramRelease
   // CHECK-CACHE: <--- urEventWait
@@ -79,7 +79,7 @@ int main() {
   // CHECK-NOT: <--- urProgramRetain
   // CHECK: <--- urKernelCreate
   // CHECK-NOT: <--- urKernelRetain
-  // CHECK: <--- urEnqueueKernelLaunch
+  // CHECK: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK: <--- urKernelRelease
   // CHECK: <--- urProgramRelease
   // CHECK: <--- urEventWait
@@ -90,7 +90,7 @@ int main() {
   // CHECK-CACHE: <--- urKernelCreate
   // CHECK-CACHE: <--- urKernelRetain
   // CHECK-CACHE-NOT: <--- urKernelCreate
-  // CHECK-CACHE: <--- urEnqueueKernelLaunch
+  // CHECK-CACHE: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-CACHE: <--- urKernelRelease
   // CHECK-CACHE: <--- urProgramRelease
   // CHECK-CACHE: <--- urEventWait
diff --git a/sycl/test-e2e/Scheduler/HostAccDestruction.cpp b/sycl/test-e2e/Scheduler/HostAccDestruction.cpp
index fd9465935dfe1..30e98974b5f1d 100644
--- a/sycl/test-e2e/Scheduler/HostAccDestruction.cpp
+++ b/sycl/test-e2e/Scheduler/HostAccDestruction.cpp
@@ -32,5 +32,5 @@ int main() {
 }
 
 // CHECK:host acc destructor call
-// CHECK: <--- urEnqueueKernelLaunch
+// CHECK: <--- urEnqueueKernelLaunchWithArgsExp
 // CHECK:end of scope
diff --git a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
index 0c0e1750805f2..df8062d165ccb 100644
--- a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
+++ b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
@@ -36,15 +36,15 @@ int main() {
 
   // Sequential submissions to the same in-order queue should not result in any
   // event dependencies.
-  // CHECK: <--- urEnqueueKernelLaunch
+  // CHECK: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-SAME: .numEventsInWaitList = 0
   submitKernel(InOrderQueueA, Buf);
-  // CHECK: <--- urEnqueueKernelLaunch
+  // CHECK: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-SAME: .numEventsInWaitList = 0
   submitKernel(InOrderQueueA, Buf);
   // Submisssion to a different in-order queue should explicitly depend on the
   // previous command group.
-  // CHECK: <--- urEnqueueKernelLaunch
+  // CHECK: <--- urEnqueueKernelLaunchWithArgsExp
   // CHECK-SAME: .numEventsInWaitList = 1
   submitKernel(InOrderQueueB, Buf);
 
diff --git a/sycl/test-e2e/SpecConstants/2020/image_selection.cpp b/sycl/test-e2e/SpecConstants/2020/image_selection.cpp
index d473098defeb1..bc2f5901ec8c4 100644
--- a/sycl/test-e2e/SpecConstants/2020/image_selection.cpp
+++ b/sycl/test-e2e/SpecConstants/2020/image_selection.cpp
@@ -65,40 +65,40 @@ int main() {
   // submission depending on whether spec const value was set or not. a. In the
   // case when we select image where specialization constants are replaced with
   // default value - specialization constant buffer is not created and we set
-  // nullptr in urKernelSetArgMemObj (4th parameter) b. In the case when we
-  // select regular image - specialization constant buffer is created and we set
-  // a real pointer in urKernelSetArgMemObj.
+  // nullptr in urEnqueueKernelLaunchWithArgsExp. In the case when we select
+  // regular image - specialization constant buffer is created and we set a
+  // real pointer in urEnqueueKernelLaunchWithArgsExp.
 
   // CHECK-DEFAULT: Submission 0
-  // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS;
+  // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-DEFAULT: Default value of specialization constant was used.
 
   // CHECK-DEFAULT: Submission 1
-  // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS;
+  // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-DEFAULT: New specialization constant value was set.
 
   // CHECK-DEFAULT: Submission 2
-  // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS;
+  // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-DEFAULT: Default value of specialization constant was used.
 
   // CHECK-DEFAULT: Submission 3
-  // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS;
+  // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-DEFAULT: New specialization constant value was set.
 
   // CHECK-ENABLED: Submission 0
-  // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS;
+  // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-ENABLED: Default value of specialization constant was used.
 
   // CHECK-ENABLED: Submission 1
-  // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS;
+  // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}}-> UR_RESULT_SUCCESS;
   // CHECK-ENABLED: New specialization constant value was set.
 
   // CHECK-ENABLED: Submission 2
-  // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS;
+  // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-ENABLED: Default value of specialization constant was used.
 
   // CHECK-ENABLED: Submission 3
-  // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS;
+  // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-ENABLED: New specialization constant value was set.
 
   // CHECK-MIX: Submission 0
@@ -138,11 +138,11 @@ int main() {
   // default, that's why nullptr is set as 4th parameter of
   // urKernelSetArgMemObj.
   // CHECK-DEFAULT: Kernel bundle
-  // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS;
+  // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-DEFAULT: Default value of specialization constant was used.
 
   // CHECK-ENABLED: Kernel bundle
-  // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS;
+  // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-ENABLED: Default value of specialization constant was used.
 
   // CHECK-MIX: Kernel bundle
@@ -170,7 +170,7 @@ int main() {
   // constants. We are verifying that by checking the 4th parameter is set to
   // zero.
   // CHECK-DEFAULT-EXPLICIT-SET: Default value was explicitly set
-  // CHECK-DEFAULT-EXPLICIT-SET: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS;
+  // CHECK-DEFAULT-EXPLICIT-SET: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-DEFAULT-EXPLICIT-SET: Default value of specialization constant was used.
   std::cout << "Default value was explicitly set" << std::endl;
   Q.submit([&](sycl::handler &cgh) {
@@ -193,7 +193,7 @@ int main() {
   // values of specialization constants. We are verifying that by checking the
   // 4th parameter is set to zero.
   // CHECK-DEFAULT-BACK-TO-DEFAULT: Changed to new value and then default value was explicitly set
-  // CHECK-DEFAULT-BACK-TO-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS;
+  // CHECK-DEFAULT-BACK-TO-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS;
   // CHECK-DEFAULT-BACK-TO-DEFAULT: Default value of specialization constant was used.
   std::cout << "Changed to new value and then default value was explicitly set"
             << std::endl;
diff --git a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp
index 8faa2f2678356..28f071237ffa0 100644
--- a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp
+++ b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp
@@ -22,6 +22,13 @@
 // CHECK-NEXT: UR Call Begin : urPlatformGetInfo
 // CHECK-NEXT: UR Call Begin : urKernelSetExecInfo
 // CHECK-NEXT: UR Call Begin : urKernelRetain
+// CHECK:      UR Call Begin : urKernelGetGroupInfo
+// CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunchWithArgsExp
+// CHECK:      UR Call Begin : urKernelCreate
+// CHECK-NEXT: UR Call Begin : urPlatformGetInfo
+// CHECK-NEXT: UR Call Begin : urPlatformGetInfo
+// CHECK-NEXT: UR Call Begin : urKernelSetExecInfo
+// CHECK-NEXT: UR Call Begin : urKernelRetain
 // CHECK:      Node create
 // CHECK-DAG:    queue_id : {{.*}}
 // CHECK-DAG:    sym_line_no : {{.*}}
@@ -38,9 +45,8 @@
 // CHECK-DAG:    from_source : false
 // CHECK-DAG:    kernel_name : typeinfo name for main::{lambda(sycl::_V1::handler&)#1}::operator()(sycl::_V1::handler&) const::{lambda()#1}
 // CHECK-DAG:    sycl_device : {{.*}}
-// CHECK:      UR Call Begin : urKernelSetArgPointer
-// CHECK-NEXT: UR Call Begin : urKernelGetGroupInfo
-// CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunch
+// CHECK:      UR Call Begin : urKernelGetGroupInfo
+// CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunchWithArgsExp
 // CHECK-NEXT: Signal
 // CHECK-DAG:    queue_id : {{.*}}
 // CHECK-DAG:    sym_line_no : {{.*}}
diff --git a/sycl/tools/xpti_helpers/usm_analyzer.hpp b/sycl/tools/xpti_helpers/usm_analyzer.hpp
index 6df1c522899d9..0dc46427557bb 100644
--- a/sycl/tools/xpti_helpers/usm_analyzer.hpp
+++ b/sycl/tools/xpti_helpers/usm_analyzer.hpp
@@ -254,6 +254,11 @@ class USMAnalyzer {
       handleKernelSetArgPointer(
           static_cast<ur_kernel_set_arg_pointer_params_t *>(Data->args_data));
       return;
+    case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP:
+      handleEnqueueKernelLaunchWithArgsExp(
+          static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(
+              Data->args_data));
+      return;
     default:
       return;
     }
@@ -421,4 +426,18 @@ class USMAnalyzer {
         "kernel parameter with index = " + std::to_string(*Params->pargIndex),
         Ptr, 0 /*no data how it will be used in kernel*/, "kernel");
   }
+
+  static void handleEnqueueKernelLaunchWithArgsExp(
+      const ur_enqueue_kernel_launch_with_args_exp_params_t *Params) {
+    // Search for pointer args and validate the pointers
+    for (uint32_t i = 0; i < *Params->pnumArgs; i++) {
+      if ((*Params->ppArgs)[i].type == UR_EXP_KERNEL_ARG_TYPE_POINTER) {
+        void *Ptr = (const_cast<void *>((*Params->ppArgs)[i].value.pointer));
+        CheckPointerValidness("kernel parameter with index = " +
+                                  std::to_string((*Params->ppArgs)[i].index),
+                              Ptr, 0 /*no data how it will be used in kernel*/,
+                              "kernel");
+      }
+    }
+  }
 };
diff --git a/sycl/unittests/Extensions/DeviceGlobal.cpp b/sycl/unittests/Extensions/DeviceGlobal.cpp
index b6fbc9bd8ab3e..6aa8c32405830 100644
--- a/sycl/unittests/Extensions/DeviceGlobal.cpp
+++ b/sycl/unittests/Extensions/DeviceGlobal.cpp
@@ -191,8 +191,9 @@ ur_result_t after_urEventGetInfo(void *pParams) {
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t after_urEnqueueKernelLaunch(void *pParams) {
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+ur_result_t after_urEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   ++KernelCallCounter;
   EXPECT_TRUE(DeviceGlobalInitEvent.has_value())
       << "DeviceGlobalInitEvent has not been set. Kernel call "
@@ -275,7 +276,7 @@ TEST_F(DeviceGlobalTest, DeviceGlobalInitBeforeUse) {
   REDEFINE_AFTER(urEnqueueUSMMemcpy);
   REDEFINE_AFTER_TEMPLATED(urEnqueueDeviceGlobalVariableWrite, true);
   REDEFINE_AFTER(urEventGetInfo);
-  REDEFINE_AFTER(urEnqueueKernelLaunch);
+  REDEFINE_AFTER(urEnqueueKernelLaunchWithArgsExp);
 
   // Kernel call 1.
   // First launch should create both init events.
diff --git a/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp
index a1014adbff686..9e6366ce16abf 100644
--- a/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp
+++ b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp
@@ -13,6 +13,7 @@
 #include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
 
 using namespace sycl;
+using namespace FreeFunctionEventsHelpers;
 
 namespace oneapiext = ext::oneapi::experimental;
 
@@ -26,7 +27,7 @@ class EnqueueFunctionsEventsTests : public ::testing::Test {
 
 protected:
   void SetUp() override {
-    counter_urEnqueueKernelLaunch = 0;
+    counter_urEnqueueKernelLaunchWithArgsExp = 0;
     counter_urUSMEnqueueMemcpy = 0;
     counter_urUSMEnqueueFill = 0;
     counter_urUSMEnqueuePrefetch = 0;
@@ -39,28 +40,31 @@ class EnqueueFunctionsEventsTests : public ::testing::Test {
 };
 
 TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   oneapiext::submit(Q, [&](handler &CGH) {
     oneapiext::single_task<TestKernel>(CGH, []() {});
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   oneapiext::single_task<TestKernel>(Q, []() {});
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -74,12 +78,13 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskKernelNoEvent) {
   oneapiext::submit(Q,
                     [&](handler &CGH) { oneapiext::single_task(CGH, Kernel); });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -93,32 +98,35 @@ TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutKernelNoEvent) {
 
   oneapiext::single_task(Q, Kernel);
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   oneapiext::submit(Q, [&](handler &CGH) {
     oneapiext::parallel_for<TestKernel>(CGH, range<1>{32}, [](item<1>) {});
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   oneapiext::parallel_for<TestKernel>(Q, range<1>{32}, [](item<1>) {});
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -133,12 +141,13 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForKernelNoEvent) {
     oneapiext::parallel_for(CGH, range<1>{32}, Kernel);
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -152,34 +161,37 @@ TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutKernelNoEvent) {
 
   oneapiext::parallel_for(Q, range<1>{32}, Kernel);
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   oneapiext::submit(Q, [&](handler &CGH) {
     oneapiext::nd_launch<TestKernel>(
         CGH, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {});
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   oneapiext::nd_launch<TestKernel>(Q, nd_range<1>{range<1>{32}, range<1>{32}},
                                    [](nd_item<1>) {});
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -194,12 +206,13 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchKernelNoEvent) {
     oneapiext::nd_launch(CGH, nd_range<1>{range<1>{32}, range<1>{32}}, Kernel);
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -213,7 +226,7 @@ TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutKernelNoEvent) {
 
   oneapiext::nd_launch(Q, nd_range<1>{range<1>{32}, range<1>{32}}, Kernel);
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(EnqueueFunctionsEventsTests, SubmitMemcpyNoEvent) {
@@ -375,8 +388,9 @@ TEST_F(EnqueueFunctionsEventsTests, MemAdviseShortcutNoEvent) {
 TEST_F(EnqueueFunctionsEventsTests, BarrierBeforeHostTask) {
   // Special test for case where host_task need an event after, so a barrier is
   // enqueued to create a usable event.
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback(
       "urEnqueueEventsWaitWithBarrier", &after_urEnqueueEventsWaitWithBarrier);
 
@@ -388,7 +402,7 @@ TEST_F(EnqueueFunctionsEventsTests, BarrierBeforeHostTask) {
          [&]() { HostTaskTimestamp = std::chrono::steady_clock::now(); });
    }).wait();
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
   ASSERT_EQ(counter_urEnqueueEventsWaitWithBarrier, size_t{1});
   ASSERT_TRUE(HostTaskTimestamp > timestamp_urEnqueueEventsWaitWithBarrier);
 }
diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp
index be5302a1c12a4..d451e51f99652 100644
--- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp
+++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp
@@ -15,6 +15,8 @@
 #define __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS
 #include <sycl/khr/free_function_commands.hpp>
 
+using namespace FreeFunctionEventsHelpers;
+
 class TestFunctor {
 public:
   void operator()() const {}
@@ -77,7 +79,7 @@ class FreeFunctionCommandsEventsTests : public ::testing::Test {
 
 protected:
   void SetUp() override {
-    counter_urEnqueueKernelLaunch = 0;
+    counter_urEnqueueKernelLaunchWithArgsExp = 0;
     counter_urUSMEnqueueMemcpy = 0;
     counter_urUSMEnqueueFill = 0;
     counter_urUSMEnqueuePrefetch = 0;
@@ -90,26 +92,29 @@ class FreeFunctionCommandsEventsTests : public ::testing::Test {
 };
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchTaskNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   sycl::khr::submit(Queue, [&](sycl::handler &Handler) {
     sycl::khr::launch_task(Handler, TestFunctor());
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   sycl::khr::launch_task(Queue, TestFunctor());
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchTaskKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
   auto KID = sycl::get_kernel_id<TestFunctor>();
@@ -123,12 +128,13 @@ TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchTaskKernelNoEvent) {
     sycl::khr::launch_task(Handler, Kernel);
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -142,33 +148,36 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutKernelNoEvent) {
 
   sycl::khr::launch_task(Queue, Kernel);
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchForNoEvent) {
 
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   sycl::khr::submit(Queue, [&](sycl::handler &Handler) {
     sycl::khr::launch(Handler, sycl::range<1>{32}, TestFunctor());
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, LaunchForShortcutNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   sycl::khr::launch(Queue, sycl::range<1>{32}, TestFunctor());
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchForKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -183,12 +192,13 @@ TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchForKernelNoEvent) {
     sycl::khr::launch(Handler, sycl::range<1>{32}, Kernel);
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, LaunchForShortcutKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -202,35 +212,38 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchForShortcutKernelNoEvent) {
 
   sycl::khr::launch(Queue, sycl::range<1>{32}, Kernel);
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   sycl::khr::submit(Queue, [&](sycl::handler &Handler) {
     sycl::khr::launch_grouped(Handler, sycl::range<1>{32}, sycl::range<1>{32},
                               TestFunctor());
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
                             TestFunctor());
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests,
        LaunchGroupedShortcutMoveKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   TestMoveFunctor::MoveCtorCalls = 0;
   TestMoveFunctor MoveOnly;
@@ -245,7 +258,7 @@ TEST_F(FreeFunctionCommandsEventsTests,
                             std::move(MoveOnly));
 
   ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0);
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 
   // Another kernel submission is queued behind a host task,
   // to force the scheduler-based submission. In this case, the HostKernel
@@ -254,7 +267,8 @@ TEST_F(FreeFunctionCommandsEventsTests,
   // Replace the callback with an event based one, since the scheduler
   // needs to create an event internally
   mock::getCallbacks().set_replace_callback(
-      "urEnqueueKernelLaunch", &redefined_urEnqueueKernelLaunchWithEvent);
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithEvent);
 
   Queue.submit([&](sycl::handler &CGH) {
     CGH.host_task([&] {
@@ -282,8 +296,9 @@ TEST_F(FreeFunctionCommandsEventsTests,
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -299,12 +314,13 @@ TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) {
                               Kernel);
   });
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutKernelNoEvent) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback("urKernelGetInfo",
                                           &after_urKernelGetInfo);
 
@@ -319,7 +335,7 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutKernelNoEvent) {
   sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32},
                             Kernel);
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
 }
 
 TEST_F(FreeFunctionCommandsEventsTests, SubmitMemcpyNoEvent) {
@@ -482,8 +498,9 @@ TEST_F(FreeFunctionCommandsEventsTests, MemAdviseShortcutNoEvent) {
 TEST_F(FreeFunctionCommandsEventsTests, BarrierBeforeHostTask) {
   // Special test for case where host_task need an event after, so a barrier is
   // enqueued to create a usable event.
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefined_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback(
       "urEnqueueEventsWaitWithBarrier", &after_urEnqueueEventsWaitWithBarrier);
 
@@ -497,7 +514,7 @@ TEST_F(FreeFunctionCommandsEventsTests, BarrierBeforeHostTask) {
       })
       .wait();
 
-  ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
+  ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1});
   ASSERT_EQ(counter_urEnqueueEventsWaitWithBarrier, size_t{1});
   ASSERT_TRUE(HostTaskTimestamp > timestamp_urEnqueueEventsWaitWithBarrier);
 }
diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp
index 4e57f82002654..a87dda42021d1 100644
--- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp
+++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp
@@ -9,6 +9,8 @@
 #include <sycl/properties/all_properties.hpp>
 #include <sycl/usm.hpp>
 
+namespace FreeFunctionEventsHelpers {
+
 inline ur_result_t after_urKernelGetInfo(void *pParams) {
   auto params = *static_cast<ur_kernel_get_info_params_t *>(pParams);
   constexpr char MockKernel[] = "TestKernel";
@@ -23,10 +25,11 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) {
   return UR_RESULT_SUCCESS;
 }
 
-static thread_local size_t counter_urEnqueueKernelLaunch = 0;
-inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) {
-  ++counter_urEnqueueKernelLaunch;
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+static thread_local size_t counter_urEnqueueKernelLaunchWithArgsExp = 0;
+inline ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  ++counter_urEnqueueKernelLaunchWithArgsExp;
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   EXPECT_EQ(*params.pphEvent, nullptr);
   return UR_RESULT_SUCCESS;
 }
@@ -34,7 +37,8 @@ inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) {
 static thread_local size_t counter_urEnqueueKernelLaunchWithEvent = 0;
 inline ur_result_t redefined_urEnqueueKernelLaunchWithEvent(void *pParams) {
   ++counter_urEnqueueKernelLaunchWithEvent;
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   EXPECT_NE(*params.pphEvent, nullptr);
   return UR_RESULT_SUCCESS;
 }
@@ -79,3 +83,5 @@ inline ur_result_t after_urEnqueueEventsWaitWithBarrier(void *pParams) {
   timestamp_urEnqueueEventsWaitWithBarrier = std::chrono::steady_clock::now();
   return UR_RESULT_SUCCESS;
 }
+
+} // namespace FreeFunctionEventsHelpers
diff --git a/sycl/unittests/Extensions/USMMemcpy2D.cpp b/sycl/unittests/Extensions/USMMemcpy2D.cpp
index e05164d2ac66d..bf8b5df0c4793 100644
--- a/sycl/unittests/Extensions/USMMemcpy2D.cpp
+++ b/sycl/unittests/Extensions/USMMemcpy2D.cpp
@@ -264,8 +264,9 @@ ur_result_t after_urKernelCreate(void *pParams) {
 
 std::string LastEnqueuedKernel;
 
-ur_result_t after_urEnqueueKernelLaunch(void *pParams) {
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+ur_result_t after_urEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   auto KernelIt = KernelToNameMap.find(*params.phKernel);
   EXPECT_TRUE(KernelIt != KernelToNameMap.end());
   LastEnqueuedKernel = KernelIt->second;
@@ -347,8 +348,9 @@ TEST(USMMemcpy2DTest, USMMemops2DUnsupported) {
                                           &after_urDeviceGetInfo);
   mock::getCallbacks().set_after_callback("urKernelCreate",
                                           &after_urKernelCreate);
-  mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch",
-                                          &after_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_after_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &after_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback(
       "urUSMGetMemAllocInfo", &after_urUSMGetMemAllocInfo<UR_USM_TYPE_DEVICE>);
 
@@ -388,8 +390,9 @@ TEST(USMMemcpy2DTest, USMFillSupportedOnly) {
                                           &after_urDeviceGetInfo);
   mock::getCallbacks().set_after_callback("urKernelCreate",
                                           &after_urKernelCreate);
-  mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch",
-                                          &after_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_after_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &after_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_replace_callback("urEnqueueUSMFill2D",
                                             &redefine_urEnqueueUSMFill2D);
   mock::getCallbacks().set_after_callback(
@@ -435,8 +438,9 @@ TEST(USMMemcpy2DTest, USMMemsetSupportedOnly) {
                                           &after_urDeviceGetInfo);
   mock::getCallbacks().set_after_callback("urKernelCreate",
                                           &after_urKernelCreate);
-  mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch",
-                                          &after_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_after_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &after_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_after_callback(
       "urUSMGetMemAllocInfo", &after_urUSMGetMemAllocInfo<UR_USM_TYPE_DEVICE>);
   mock::getCallbacks().set_replace_callback("urEnqueueUSMFill2D",
@@ -480,8 +484,9 @@ TEST(USMMemcpy2DTest, USMMemcpySupportedOnly) {
                                           &after_urDeviceGetInfo);
   mock::getCallbacks().set_after_callback("urKernelCreate",
                                           &after_urKernelCreate);
-  mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch",
-                                          &after_urEnqueueKernelLaunch);
+  mock::getCallbacks().set_after_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &after_urEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_replace_callback("urEnqueueUSMMemcpy2D",
                                             &redefine_urEnqueueUSMMemcpy2D);
   mock::getCallbacks().set_after_callback(
diff --git a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp
index 97159ba3a278d..3abf2f7eca47b 100644
--- a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp
+++ b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp
@@ -46,16 +46,24 @@ static sycl::unittest::MockDeviceImage Img =
     sycl::unittest::generateDefaultImage({"WorkGroupMemoryKernel"});
 static sycl::unittest::MockDeviceImageArray<1> ImgArray{&Img};
 
-static int urKernelSetArgLocalCalls = 0;
-inline ur_result_t redefined_urKernelSetArgLocal(void *) {
-  ++urKernelSetArgLocalCalls;
+static int LocalMemArgs = 0;
+inline ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
+  auto Args = *params.ppArgs;
+  for (uint32_t i = 0; i < *params.pnumArgs; i++) {
+    if (Args[i].type == UR_EXP_KERNEL_ARG_TYPE_LOCAL) {
+      ++LocalMemArgs;
+    }
+  }
   return UR_RESULT_SUCCESS;
 }
 
 TEST(URArgumentTest, URArgumentTest) {
   sycl::unittest::UrMock<> Mock;
-  mock::getCallbacks().set_replace_callback("urKernelSetArgLocal",
-                                            &redefined_urKernelSetArgLocal);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
   sycl::platform Platform = sycl::platform();
   const sycl::device dev = Platform.get_devices()[0];
   sycl::queue q{dev};
@@ -66,5 +74,5 @@ TEST(URArgumentTest, URArgumentTest) {
                                               kernel);
   });
   q.wait();
-  ASSERT_EQ(urKernelSetArgLocalCalls, 1);
+  ASSERT_EQ(LocalMemArgs, 1);
 }
diff --git a/sycl/unittests/buffer/KernelArgMemObj.cpp b/sycl/unittests/buffer/KernelArgMemObj.cpp
index b826e89a128b8..136f5b73843c4 100644
--- a/sycl/unittests/buffer/KernelArgMemObj.cpp
+++ b/sycl/unittests/buffer/KernelArgMemObj.cpp
@@ -42,13 +42,21 @@ static sycl::unittest::MockDeviceImageArray<1> ImgArray{&Img};
 using namespace sycl;
 
 bool PropertyPresent = false;
-ur_kernel_arg_mem_obj_properties_t PropsCopy{};
-
-ur_result_t redefinedKernelSetArgMemObj(void *pParams) {
-  auto params = *static_cast<ur_kernel_set_arg_mem_obj_params_t *>(pParams);
-  PropertyPresent = *params.ppProperties != nullptr;
-  if (PropertyPresent)
-    PropsCopy = **params.ppProperties;
+ur_mem_flags_t MemFlags{};
+
+ur_result_t redefinedEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
+  auto Args = *params.ppArgs;
+  for (uint32_t i = 0; i < *params.pnumArgs; i++) {
+    if (Args[i].type != UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ) {
+      continue;
+    }
+    PropertyPresent = Args[i].value.memObjTuple.flags != 0;
+    if (PropertyPresent) {
+      MemFlags = Args[i].value.memObjTuple.flags;
+    }
+  }
   return UR_RESULT_SUCCESS;
 }
 
@@ -59,9 +67,10 @@ class BuferTestUrArgs : public ::testing::Test {
 protected:
   void SetUp() override {
     PropertyPresent = false;
-    PropsCopy = {};
-    mock::getCallbacks().set_before_callback("urKernelSetArgMemObj",
-                                             &redefinedKernelSetArgMemObj);
+    MemFlags = 0;
+    mock::getCallbacks().set_before_callback(
+        "urEnqueueKernelLaunchWithArgsExp",
+        &redefinedEnqueueKernelLaunchWithArgsExp);
   }
 
   template <sycl::access::mode AccessMode>
@@ -80,7 +89,7 @@ class BuferTestUrArgs : public ::testing::Test {
         })
         .wait();
     ASSERT_TRUE(PropertyPresent);
-    EXPECT_EQ(PropsCopy.memoryAccess, ExpectedAccessMode);
+    EXPECT_EQ(MemFlags, ExpectedAccessMode);
   }
 
 protected:
diff --git a/sycl/unittests/handler/SetArgForLocalAccessor.cpp b/sycl/unittests/handler/SetArgForLocalAccessor.cpp
index 7a9079872ce36..aef26577bc50b 100644
--- a/sycl/unittests/handler/SetArgForLocalAccessor.cpp
+++ b/sycl/unittests/handler/SetArgForLocalAccessor.cpp
@@ -21,9 +21,15 @@ namespace {
 
 size_t LocalBufferArgSize = 0;
 
-ur_result_t redefined_urKernelSetArgLocal(void *pParams) {
-  auto params = *static_cast<ur_kernel_set_arg_local_params_t *>(pParams);
-  LocalBufferArgSize = *params.pargSize;
+ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
+  auto Args = *params.ppArgs;
+  for (uint32_t i = 0; i < *params.pnumArgs; i++) {
+    if (Args[i].type == UR_EXP_KERNEL_ARG_TYPE_LOCAL) {
+      LocalBufferArgSize = Args[i].size;
+    }
+  }
 
   return UR_RESULT_SUCCESS;
 }
@@ -31,8 +37,9 @@ ur_result_t redefined_urKernelSetArgLocal(void *pParams) {
 TEST(HandlerSetArg, LocalAccessor) {
   sycl::unittest::UrMock<> Mock;
   redefineMockForKernelInterop(Mock);
-  mock::getCallbacks().set_replace_callback("urKernelSetArgLocal",
-                                            &redefined_urKernelSetArgLocal);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   constexpr size_t Size = 128;
   sycl::queue Q;
diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp
index 8207ade72f0d0..9104c1d9e4a10 100644
--- a/sycl/unittests/scheduler/FailedCommands.cpp
+++ b/sycl/unittests/scheduler/FailedCommands.cpp
@@ -76,7 +76,7 @@ ur_result_t failingUrCall(void *) { return UR_RESULT_ERROR_UNKNOWN; }
 
 TEST_F(SchedulerTest, FailedKernelException) {
   unittest::UrMock<> Mock;
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
+  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunchWithArgsExp",
                                            &failingUrCall);
   RunWithFailedCommandsAndCheck(true, 0);
 }
@@ -94,7 +94,8 @@ ur_event_handle_t DummyEvent = mock::createDummyHandle<ur_event_handle_t>();
 
 inline ur_result_t failedEnqueueKernelLaunchWithDummy(void *pParams) {
   DummyEventReturned = true;
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   **params.pphEvent = DummyEvent;
   return UR_RESULT_ERROR_UNKNOWN;
 }
@@ -120,7 +121,7 @@ TEST(FailedCommandsTest, CheckUREventReleaseWithKernel) {
   DummyEventReleaseAttempt = false;
   DummyEventReturned = false;
   sycl::unittest::UrMock<> Mock;
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
+  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunchWithArgsExp",
                                            &failedEnqueueKernelLaunchWithDummy);
   mock::getCallbacks().set_before_callback("urEventRelease",
                                            &checkDummyInEventRelease);
diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
index 6467ff7a2809d..fa33c3a06d9bc 100644
--- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
@@ -129,8 +129,9 @@ TEST_P(SchedulerTest, InOrderQueueIsolatedDeps) {
 
 std::vector<size_t> KernelEventListSize;
 
-inline ur_result_t customEnqueueKernelLaunch(void *pParams) {
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   KernelEventListSize.push_back(*params.pnumEventsInWaitList);
   return UR_RESULT_SUCCESS;
 }
@@ -139,8 +140,9 @@ TEST_P(SchedulerTest, TwoInOrderQueuesOnSameContext) {
   KernelEventListSize.clear();
   sycl::unittest::UrMock<> Mock;
   bool UseShortcutFunction = GetParam();
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
-                                           &customEnqueueKernelLaunch);
+  mock::getCallbacks().set_before_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &customEnqueueKernelLaunchWithArgsExp);
 
   sycl::platform Plt = sycl::platform();
 
@@ -166,8 +168,9 @@ TEST_P(SchedulerTest, InOrderQueueNoSchedulerPath) {
   KernelEventListSize.clear();
   sycl::unittest::UrMock<> Mock;
   bool UseShortcutFunction = GetParam();
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
-                                           &customEnqueueKernelLaunch);
+  mock::getCallbacks().set_before_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &customEnqueueKernelLaunchWithArgsExp);
 
   sycl::platform Plt = sycl::platform();
 
diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
index 7dae968013948..c79c6ecdbba51 100644
--- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
@@ -51,8 +51,9 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) {
 enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 };
 std::vector<std::pair<CommandType, size_t>> ExecutedCommands;
 
-inline ur_result_t customEnqueueKernelLaunch(void *pParams) {
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   ExecutedCommands.push_back(
       {CommandType::KERNEL, *params.pnumEventsInWaitList});
   return UR_RESULT_SUCCESS;
@@ -68,8 +69,9 @@ inline ur_result_t customEnqueueUSMFill(void *pParams) {
 TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
   ExecutedCommands.clear();
   sycl::unittest::UrMock<> Mock;
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
-                                           &customEnqueueKernelLaunch);
+  mock::getCallbacks().set_before_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &customEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_before_callback("urEnqueueUSMFill",
                                            &customEnqueueUSMFill);
 
@@ -121,8 +123,9 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
 TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
   ExecutedCommands.clear();
   sycl::unittest::UrMock<> Mock;
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
-                                           &customEnqueueKernelLaunch);
+  mock::getCallbacks().set_before_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &customEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_before_callback("urEnqueueUSMFill",
                                            &customEnqueueUSMFill);
 
@@ -166,8 +169,9 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
 TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
   ExecutedCommands.clear();
   sycl::unittest::UrMock<> Mock;
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
-                                           &customEnqueueKernelLaunch);
+  mock::getCallbacks().set_before_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &customEnqueueKernelLaunchWithArgsExp);
 
   sycl::platform Plt = sycl::platform();
 
diff --git a/sycl/unittests/scheduler/RequiredWGSize.cpp b/sycl/unittests/scheduler/RequiredWGSize.cpp
index 675036ec30081..851ee3ee32d38 100644
--- a/sycl/unittests/scheduler/RequiredWGSize.cpp
+++ b/sycl/unittests/scheduler/RequiredWGSize.cpp
@@ -37,8 +37,9 @@ static ur_result_t redefinedKernelGetGroupInfo(void *pParams) {
   return UR_RESULT_SUCCESS;
 }
 
-static ur_result_t redefinedEnqueueKernelLaunch(void *pParams) {
-  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
+static ur_result_t redefinedEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
   if (*params.ppLocalWorkSize) {
     IncomingLocalSize[0] = (*params.ppLocalWorkSize)[0];
     IncomingLocalSize[1] = (*params.ppLocalWorkSize)[1];
@@ -56,8 +57,9 @@ static void reset() {
 static void performChecks() {
   sycl::unittest::UrMock<> Mock;
   sycl::platform Plt = sycl::platform();
-  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
-                                           &redefinedEnqueueKernelLaunch);
+  mock::getCallbacks().set_before_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefinedEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_before_callback("urKernelGetGroupInfo",
                                            &redefinedKernelGetGroupInfo);
 
diff --git a/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp b/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp
index ca54cf0d908d6..935ca1eba18ac 100644
--- a/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp
+++ b/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp
@@ -24,23 +24,33 @@ constexpr std::size_t LaunchCount = 8;
 
 uint32_t LastArgSet = -1;
 std::size_t LastThread = -1;
-ur_result_t redefined_urKernelSetArgValue(void *pParams) {
-  auto params = *static_cast<ur_kernel_set_arg_value_params_t *>(pParams);
-  EXPECT_EQ((LastArgSet + 1) % NArgs, *params.pargIndex);
-  LastArgSet = *params.pargIndex;
-  std::size_t ArgValue = *static_cast<const std::size_t *>(*params.ppArgValue);
-  if (*params.pargIndex == 0)
-    LastThread = ArgValue;
-  else
-    EXPECT_EQ(LastThread, ArgValue);
+ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) {
+  auto params =
+      *static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
+  auto Args = *params.ppArgs;
+  for (uint32_t i = 0; i < *params.pnumArgs; i++) {
+    if (Args[i].type != UR_EXP_KERNEL_ARG_TYPE_VALUE) {
+      continue;
+    }
+    auto ArgIndex = Args[i].index;
+    EXPECT_EQ((LastArgSet + 1) % NArgs, ArgIndex);
+    LastArgSet = ArgIndex;
+    std::size_t ArgValue =
+        *static_cast<const std::size_t *>(Args[i].value.pointer);
+    if (ArgIndex == 0)
+      LastThread = ArgValue;
+    else
+      EXPECT_EQ(LastThread, ArgValue);
+  }
   return UR_RESULT_SUCCESS;
 }
 
 TEST(KernelEnqueue, InteropKernel) {
   unittest::UrMock<> Mock;
   redefineMockForKernelInterop(Mock);
-  mock::getCallbacks().set_replace_callback("urKernelSetArgValue",
-                                            &redefined_urKernelSetArgValue);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefined_urEnqueueKernelLaunchWithArgsExp);
 
   platform Plt = sycl::platform();
   queue Q;
diff --git a/sycl/unittests/xpti_trace/QueueApiFailures.cpp b/sycl/unittests/xpti_trace/QueueApiFailures.cpp
index 1e88143774e21..303567ddcf953 100644
--- a/sycl/unittests/xpti_trace/QueueApiFailures.cpp
+++ b/sycl/unittests/xpti_trace/QueueApiFailures.cpp
@@ -30,7 +30,7 @@ inline ur_result_t redefinedAdapterGetLastError(void *) {
   return UR_RESULT_ERROR_INVALID_VALUE;
 }
 
-ur_result_t redefinedEnqueueKernelLaunch(void *) {
+ur_result_t redefinedEnqueueKernelLaunchWithArgsExp(void *) {
   return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
 }
 
@@ -92,8 +92,9 @@ class QueueApiFailures : public ::testing::Test {
 };
 
 TEST_F(QueueApiFailures, QueueSubmit) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefinedEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefinedEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_replace_callback("urAdapterGetLastError",
                                             &redefinedAdapterGetLastError);
   sycl::queue Q;
@@ -116,8 +117,9 @@ TEST_F(QueueApiFailures, QueueSubmit) {
 }
 
 TEST_F(QueueApiFailures, QueueSingleTask) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefinedEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefinedEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_replace_callback("urAdapterGetLastError",
                                             &redefinedAdapterGetLastError);
   sycl::queue Q;
@@ -319,8 +321,9 @@ TEST_F(QueueApiFailures, QueueMemAdvise) {
 }
 
 TEST_F(QueueApiFailures, QueueParallelFor) {
-  mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
-                                            &redefinedEnqueueKernelLaunch);
+  mock::getCallbacks().set_replace_callback(
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefinedEnqueueKernelLaunchWithArgsExp);
   mock::getCallbacks().set_replace_callback("urAdapterGetLastError",
                                             &redefinedAdapterGetLastError);
   sycl::queue Q;
@@ -451,7 +454,8 @@ ur_result_t redefinedEnqueueKernelLaunchWithStatus(void *) {
 
 TEST_F(QueueApiFailures, QueueKernelAsync) {
   mock::getCallbacks().set_replace_callback(
-      "urEnqueueKernelLaunch", &redefinedEnqueueKernelLaunchWithStatus);
+      "urEnqueueKernelLaunchWithArgsExp",
+      &redefinedEnqueueKernelLaunchWithStatus);
   mock::getCallbacks().set_replace_callback("urAdapterGetLastError",
                                             &redefinedAdapterGetLastError);
 
diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h
index 906c32b837bbe..3d1136cfe2d1f 100644
--- a/unified-runtime/include/ur_api.h
+++ b/unified-runtime/include/ur_api.h
@@ -475,6 +475,8 @@ typedef enum ur_function_t {
   UR_FUNCTION_MEMORY_EXPORT_EXPORT_MEMORY_HANDLE_EXP = 287,
   /// Enumerator for ::urBindlessImagesSupportsImportingHandleTypeExp
   UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP = 288,
+  /// Enumerator for ::urEnqueueKernelLaunchWithArgsExp
+  UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP = 289,
   /// @cond
   UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
   /// @endcond
@@ -588,6 +590,8 @@ typedef enum ur_structure_type_t {
   UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES = 0x3000,
   /// ::ur_exp_enqueue_ext_properties_t
   UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES = 0x4000,
+  /// ::ur_exp_kernel_arg_properties_t
+  UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES = 0x5000,
   /// @cond
   UR_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
   /// @endcond
@@ -12895,6 +12899,166 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
     /// propName.
     size_t *pPropSizeRet);
 
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Unified Runtime Experimental API for setting args at kernel
+// launch
+#if !defined(__GNUC__)
+#pragma region enqueue_kernel_launch_with_args_(experimental)
+#endif
+///////////////////////////////////////////////////////////////////////////////
+/// @brief What kind of kernel arg is this
+typedef enum ur_exp_kernel_arg_type_t {
+  /// Kernel arg is a value.
+  UR_EXP_KERNEL_ARG_TYPE_VALUE = 0,
+  /// Kernel arg is a pointer.
+  UR_EXP_KERNEL_ARG_TYPE_POINTER = 1,
+  /// Kernel arg is a memory object.
+  UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ = 2,
+  /// Kernel arg is a local allocation.
+  UR_EXP_KERNEL_ARG_TYPE_LOCAL = 3,
+  /// Kernel arg is a sampler.
+  UR_EXP_KERNEL_ARG_TYPE_SAMPLER = 4,
+  /// @cond
+  UR_EXP_KERNEL_ARG_TYPE_FORCE_UINT32 = 0x7fffffff
+  /// @endcond
+
+} ur_exp_kernel_arg_type_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Mem obj/properties tuple
+typedef struct ur_exp_kernel_arg_mem_obj_tuple_t {
+  /// [in] Handle of a memory object
+  ur_mem_handle_t hMem;
+  /// [in] Memory flags to associate with `hMem`. Allowed values are:
+  /// ::UR_MEM_FLAG_READ_WRITE, ::UR_MEM_FLAG_WRITE_ONLY,
+  /// ::UR_MEM_FLAG_READ_ONLY.
+  ur_mem_flags_t flags;
+
+} ur_exp_kernel_arg_mem_obj_tuple_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Typesafe container for a kernel argument value
+typedef union ur_exp_kernel_arg_value_t {
+  /// [in] argument value represented as matching arg type.
+  /// The data pointed to will be copied and therefore can be reused on return.
+  const void *value;
+  /// [in] Allocation obtained by USM allocation or virtual memory mapping
+  /// operation, or pointer to a literal value.
+  const void *pointer;
+  /// [in] Struct containing a memory object and associated flags.
+  ur_exp_kernel_arg_mem_obj_tuple_t memObjTuple;
+  /// [in] Handle of a sampler object.
+  ur_sampler_handle_t sampler;
+
+} ur_exp_kernel_arg_value_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Kernel arg properties
+typedef struct ur_exp_kernel_arg_properties_t {
+  /// [in] type of this structure, must be
+  /// ::UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES
+  ur_structure_type_t stype;
+  /// [in,out][optional] pointer to extension-specific structure
+  void *pNext;
+  /// [in] type of the kernel arg
+  ur_exp_kernel_arg_type_t type;
+  /// [in] index of the kernel arg
+  uint32_t index;
+  /// [in] size of the kernel arg
+  size_t size;
+  /// [in][tagged_by(type)] Union containing the argument value.
+  ur_exp_kernel_arg_value_t value;
+
+} ur_exp_kernel_arg_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a command to execute a kernel
+///
+/// @remarks
+///   _Analogues_
+///     - **clEnqueueNDRangeKernel**
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hQueue`
+///         + `NULL == hKernel`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkSize`
+///         + `launchPropList == NULL && numPropsInLaunchPropList > 0`
+///         + `pArgs == NULL && numArgs > 0`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL
+///     - ::UR_RESULT_ERROR_INVALID_EVENT
+///     - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST
+///         + `phEventWaitList == NULL && numEventsInWaitList > 0`
+///         + `phEventWaitList != NULL && numEventsInWaitList == 0`
+///         + If event objects in phEventWaitList are not valid events.
+///     - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS
+///         + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR.
+///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
+///         + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 ||
+///         pGlobalWorkSize[2] == 0`
+///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
+///         + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] ==
+///         0 || pLocalWorkSize[2] == 0)`
+///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values
+///     have not been specified."
+///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
+///     - ::UR_RESULT_ERROR_INVALID_OPERATION
+///         + If any property in `launchPropList` isn't supported by the device.
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list
+    /// of launch properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution.
+    /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional][alloc] return an event object that identifies this
+    /// particular kernel execution instance. If phEventWaitList and phEvent
+    /// are not NULL, phEvent must not refer to an element of the
+    /// phEventWaitList array.
+    ur_event_handle_t *phEvent);
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
@@ -14497,6 +14661,26 @@ typedef struct ur_enqueue_write_host_pipe_params_t {
   ur_event_handle_t **pphEvent;
 } ur_enqueue_write_host_pipe_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urEnqueueKernelLaunchWithArgsExp
+/// @details Each entry is a pointer to the parameter passed to the function;
+///     allowing the callback the ability to modify the parameter's value
+typedef struct ur_enqueue_kernel_launch_with_args_exp_params_t {
+  ur_queue_handle_t *phQueue;
+  ur_kernel_handle_t *phKernel;
+  uint32_t *pworkDim;
+  const size_t **ppGlobalWorkOffset;
+  const size_t **ppGlobalWorkSize;
+  const size_t **ppLocalWorkSize;
+  uint32_t *pnumArgs;
+  const ur_exp_kernel_arg_properties_t **ppArgs;
+  uint32_t *pnumPropsInLaunchPropList;
+  const ur_kernel_launch_property_t **plaunchPropList;
+  uint32_t *pnumEventsInWaitList;
+  const ur_event_handle_t **pphEventWaitList;
+  ur_event_handle_t **pphEvent;
+} ur_enqueue_kernel_launch_with_args_exp_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for urEnqueueEventsWaitWithBarrierExt
 /// @details Each entry is a pointer to the parameter passed to the function;
diff --git a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def
index f0c92445b9238..97092258a5a46 100644
--- a/unified-runtime/include/ur_api_funcs.def
+++ b/unified-runtime/include/ur_api_funcs.def
@@ -133,6 +133,7 @@ _UR_API(urEnqueueDeviceGlobalVariableRead)
 _UR_API(urEnqueueReadHostPipe)
 _UR_API(urEnqueueWriteHostPipe)
 _UR_API(urEnqueueEventsWaitWithBarrierExt)
+_UR_API(urEnqueueKernelLaunchWithArgsExp)
 _UR_API(urEnqueueUSMDeviceAllocExp)
 _UR_API(urEnqueueUSMSharedAllocExp)
 _UR_API(urEnqueueUSMHostAllocExp)
diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h
index f59e15a9eb3cd..b0c95cd7509fc 100644
--- a/unified-runtime/include/ur_ddi.h
+++ b/unified-runtime/include/ur_ddi.h
@@ -1097,6 +1097,15 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
 typedef ur_result_t(UR_APICALL *ur_pfnGetEnqueueProcAddrTable_t)(
     ur_api_version_t, ur_enqueue_dditable_t *);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urEnqueueKernelLaunchWithArgsExp
+typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchWithArgsExp_t)(
+    ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *,
+    const size_t *, const size_t *, uint32_t,
+    const ur_exp_kernel_arg_properties_t *, uint32_t,
+    const ur_kernel_launch_property_t *, uint32_t, const ur_event_handle_t *,
+    ur_event_handle_t *);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function-pointer for urEnqueueUSMDeviceAllocExp
 typedef ur_result_t(UR_APICALL *ur_pfnEnqueueUSMDeviceAllocExp_t)(
@@ -1147,6 +1156,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Table of EnqueueExp functions pointers
 typedef struct ur_enqueue_exp_dditable_t {
+  ur_pfnEnqueueKernelLaunchWithArgsExp_t pfnKernelLaunchWithArgsExp;
   ur_pfnEnqueueUSMDeviceAllocExp_t pfnUSMDeviceAllocExp;
   ur_pfnEnqueueUSMSharedAllocExp_t pfnUSMSharedAllocExp;
   ur_pfnEnqueueUSMHostAllocExp_t pfnUSMHostAllocExp;
diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h
index 7c4528ec9ea81..567960c90cfff 100644
--- a/unified-runtime/include/ur_print.h
+++ b/unified-runtime/include/ur_print.h
@@ -1425,6 +1425,36 @@ UR_APIEXPORT ur_result_t UR_APICALL
 urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer,
                    const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_exp_kernel_arg_type_t enum
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL
+urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value, char *buffer,
+                        const size_t buff_size, size_t *out_size);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_exp_kernel_arg_mem_obj_tuple_t struct
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgMemObjTuple(
+    const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer,
+    const size_t buff_size, size_t *out_size);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_exp_kernel_arg_properties_t struct
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgProperties(
+    const struct ur_exp_kernel_arg_properties_t params, char *buffer,
+    const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_exp_enqueue_ext_flag_t enum
 /// @returns
@@ -2694,6 +2724,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(
     const struct ur_enqueue_write_host_pipe_params_t *params, char *buffer,
     const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_enqueue_kernel_launch_with_args_exp_params_t struct
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueKernelLaunchWithArgsExpParams(
+    const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params,
+    char *buffer, const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_enqueue_events_wait_with_barrier_ext_params_t struct
 /// @returns
diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp
index b3de6c166ca31..d8c98a667088b 100644
--- a/unified-runtime/include/ur_print.hpp
+++ b/unified-runtime/include/ur_print.hpp
@@ -50,6 +50,8 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t flag);
 template <typename T>
 inline ur_result_t printTagged(std::ostream &os, const void *ptr, T value,
                                size_t size);
+template <size_t size, typename T>
+inline ur_result_t printArray(std::ostream &os, const T *ptr);
 
 inline ur_result_t printStruct(std::ostream &os, const void *ptr);
 
@@ -263,6 +265,10 @@ template <>
 inline ur_result_t printTagged(std::ostream &os, const void *ptr,
                                ur_exp_peer_info_t value, size_t size);
 
+inline ur_result_t printUnion(std::ostream &os,
+                              const union ur_exp_kernel_arg_value_t params,
+                              const enum ur_exp_kernel_arg_type_t tag);
+
 template <>
 inline ur_result_t printFlag<ur_exp_enqueue_ext_flag_t>(std::ostream &os,
                                                         uint32_t flag);
@@ -594,6 +600,14 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct
            ur_exp_command_buffer_update_kernel_launch_desc_t params);
 inline std::ostream &operator<<(std::ostream &os,
                                 enum ur_exp_peer_info_t value);
+inline std::ostream &operator<<(std::ostream &os,
+                                enum ur_exp_kernel_arg_type_t value);
+inline std::ostream &operator<<(
+    std::ostream &os,
+    [[maybe_unused]] const struct ur_exp_kernel_arg_mem_obj_tuple_t params);
+inline std::ostream &
+operator<<(std::ostream &os,
+           [[maybe_unused]] const struct ur_exp_kernel_arg_properties_t params);
 inline std::ostream &operator<<(std::ostream &os,
                                 enum ur_exp_enqueue_ext_flag_t value);
 inline std::ostream &operator<<(
@@ -1276,6 +1290,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
   case UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP:
     os << "UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP";
     break;
+  case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP:
+    os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP";
+    break;
   default:
     os << "unknown enumerator";
     break;
@@ -1445,6 +1462,9 @@ inline std::ostream &operator<<(std::ostream &os,
   case UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES:
     os << "UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES";
     break;
+  case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES:
+    os << "UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES";
+    break;
   default:
     os << "unknown enumerator";
     break;
@@ -1761,6 +1781,12 @@ inline ur_result_t printStruct(std::ostream &os, const void *ptr) {
         (const ur_exp_enqueue_ext_properties_t *)ptr;
     printPtr(os, pstruct);
   } break;
+
+  case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES: {
+    const ur_exp_kernel_arg_properties_t *pstruct =
+        (const ur_exp_kernel_arg_properties_t *)ptr;
+    printPtr(os, pstruct);
+  } break;
   default:
     os << "unknown enumerator";
     return UR_RESULT_ERROR_INVALID_ENUMERATION;
@@ -10963,13 +10989,7 @@ printUnion(std::ostream &os,
   case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
 
     os << ".clusterDim = {";
-    for (auto i = 0; i < 3; i++) {
-      if (i != 0) {
-        os << ", ";
-      }
-
-      os << (params.clusterDim[i]);
-    }
+    ur::details::printArray<3>(os, params.clusterDim);
     os << "}";
 
     break;
@@ -11653,13 +11673,7 @@ operator<<(std::ostream &os, const struct ur_exp_sampler_addr_modes_t params) {
 
   os << ", ";
   os << ".addrModes = {";
-  for (auto i = 0; i < 3; i++) {
-    if (i != 0) {
-      os << ", ";
-    }
-
-    os << (params.addrModes[i]);
-  }
+  ur::details::printArray<3>(os, params.addrModes);
   os << "}";
 
   os << "}";
@@ -12378,6 +12392,141 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr,
 }
 } // namespace ur::details
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_exp_kernel_arg_type_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &operator<<(std::ostream &os,
+                                enum ur_exp_kernel_arg_type_t value) {
+  switch (value) {
+  case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+    os << "UR_EXP_KERNEL_ARG_TYPE_VALUE";
+    break;
+  case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+    os << "UR_EXP_KERNEL_ARG_TYPE_POINTER";
+    break;
+  case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ:
+    os << "UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ";
+    break;
+  case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+    os << "UR_EXP_KERNEL_ARG_TYPE_LOCAL";
+    break;
+  case UR_EXP_KERNEL_ARG_TYPE_SAMPLER:
+    os << "UR_EXP_KERNEL_ARG_TYPE_SAMPLER";
+    break;
+  default:
+    os << "unknown enumerator";
+    break;
+  }
+  return os;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_exp_kernel_arg_mem_obj_tuple_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ur_exp_kernel_arg_mem_obj_tuple_t params) {
+  os << "(struct ur_exp_kernel_arg_mem_obj_tuple_t){";
+
+  os << ".hMem = ";
+
+  ur::details::printPtr(os, (params.hMem));
+
+  os << ", ";
+  os << ".flags = ";
+
+  ur::details::printFlag<ur_mem_flag_t>(os, (params.flags));
+
+  os << "}";
+  return os;
+}
+namespace ur::details {
+
+///////////////////////////////////////////////////////////////////////////////
+// @brief Print ur_exp_kernel_arg_value_t union
+inline ur_result_t printUnion(std::ostream &os,
+                              const union ur_exp_kernel_arg_value_t params,
+                              const enum ur_exp_kernel_arg_type_t tag) {
+  os << "(union ur_exp_kernel_arg_value_t){";
+
+  switch (tag) {
+  case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+
+    os << ".value = ";
+
+    ur::details::printPtr(os, (params.value));
+
+    break;
+  case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+
+    os << ".pointer = ";
+
+    ur::details::printPtr(os, (params.pointer));
+
+    break;
+  case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ:
+
+    os << ".memObjTuple = ";
+
+    os << (params.memObjTuple);
+
+    break;
+  case UR_EXP_KERNEL_ARG_TYPE_SAMPLER:
+
+    os << ".sampler = ";
+
+    ur::details::printPtr(os, (params.sampler));
+
+    break;
+  default:
+    os << "<unknown>";
+    return UR_RESULT_ERROR_INVALID_ENUMERATION;
+  }
+  os << "}";
+  return UR_RESULT_SUCCESS;
+}
+} // namespace ur::details
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_exp_kernel_arg_properties_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ur_exp_kernel_arg_properties_t params) {
+  os << "(struct ur_exp_kernel_arg_properties_t){";
+
+  os << ".stype = ";
+
+  os << (params.stype);
+
+  os << ", ";
+  os << ".pNext = ";
+
+  ur::details::printStruct(os, (params.pNext));
+
+  os << ", ";
+  os << ".type = ";
+
+  os << (params.type);
+
+  os << ", ";
+  os << ".index = ";
+
+  os << (params.index);
+
+  os << ", ";
+  os << ".size = ";
+
+  os << (params.size);
+
+  os << ", ";
+  os << ".value = ";
+  ur::details::printUnion(os, (params.value), params.type);
+
+  os << "}";
+  return os;
+}
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_exp_enqueue_ext_flag_t type
 /// @returns
@@ -17027,6 +17176,114 @@ inline std::ostream &operator<<(
   return os;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the
+/// ur_enqueue_kernel_launch_with_args_exp_params_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &
+operator<<(std::ostream &os, [[maybe_unused]] const struct
+           ur_enqueue_kernel_launch_with_args_exp_params_t *params) {
+
+  os << ".hQueue = ";
+
+  ur::details::printPtr(os, *(params->phQueue));
+
+  os << ", ";
+  os << ".hKernel = ";
+
+  ur::details::printPtr(os, *(params->phKernel));
+
+  os << ", ";
+  os << ".workDim = ";
+
+  os << *(params->pworkDim);
+
+  os << ", ";
+  os << ".pGlobalWorkOffset = ";
+
+  ur::details::printPtr(os, *(params->ppGlobalWorkOffset));
+
+  os << ", ";
+  os << ".pGlobalWorkSize = ";
+
+  ur::details::printPtr(os, *(params->ppGlobalWorkSize));
+
+  os << ", ";
+  os << ".pLocalWorkSize = ";
+
+  ur::details::printPtr(os, *(params->ppLocalWorkSize));
+
+  os << ", ";
+  os << ".numArgs = ";
+
+  os << *(params->pnumArgs);
+
+  os << ", ";
+  os << ".pArgs = ";
+  ur::details::printPtr(os, reinterpret_cast<const void *>(*(params->ppArgs)));
+  if (*(params->ppArgs) != NULL) {
+    os << " {";
+    for (size_t i = 0; i < *params->pnumArgs; ++i) {
+      if (i != 0) {
+        os << ", ";
+      }
+
+      os << (*(params->ppArgs))[i];
+    }
+    os << "}";
+  }
+
+  os << ", ";
+  os << ".numPropsInLaunchPropList = ";
+
+  os << *(params->pnumPropsInLaunchPropList);
+
+  os << ", ";
+  os << ".launchPropList = ";
+  ur::details::printPtr(
+      os, reinterpret_cast<const void *>(*(params->plaunchPropList)));
+  if (*(params->plaunchPropList) != NULL) {
+    os << " {";
+    for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) {
+      if (i != 0) {
+        os << ", ";
+      }
+
+      os << (*(params->plaunchPropList))[i];
+    }
+    os << "}";
+  }
+
+  os << ", ";
+  os << ".numEventsInWaitList = ";
+
+  os << *(params->pnumEventsInWaitList);
+
+  os << ", ";
+  os << ".phEventWaitList = ";
+  ur::details::printPtr(
+      os, reinterpret_cast<const void *>(*(params->pphEventWaitList)));
+  if (*(params->pphEventWaitList) != NULL) {
+    os << " {";
+    for (size_t i = 0; i < *params->pnumEventsInWaitList; ++i) {
+      if (i != 0) {
+        os << ", ";
+      }
+
+      ur::details::printPtr(os, (*(params->pphEventWaitList))[i]);
+    }
+    os << "}";
+  }
+
+  os << ", ";
+  os << ".phEvent = ";
+
+  ur::details::printPtr(os, *(params->pphEvent));
+
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the
 /// ur_enqueue_events_wait_with_barrier_ext_params_t type
@@ -21160,6 +21417,25 @@ inline ur_result_t printPtr(std::ostream &os, const T *ptr) {
 
   return UR_RESULT_SUCCESS;
 }
+
+///////////////////////////////////////////////////////////////////////////////
+// @brief Print array of literals
+template <size_t size, typename T>
+inline ur_result_t printArray(std::ostream &os, const T *ptr) {
+  if (ptr == NULL) {
+    return printPtr(os, ptr);
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    if (i != 0) {
+      os << ", ";
+    }
+
+    os << ptr[i];
+  }
+
+  return UR_RESULT_SUCCESS;
+}
 } // namespace ur::details
 
 namespace ur::extras {
@@ -21554,6 +21830,10 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os,
   case UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE: {
     os << (const struct ur_enqueue_write_host_pipe_params_t *)params;
   } break;
+  case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: {
+    os << (const struct ur_enqueue_kernel_launch_with_args_exp_params_t *)
+            params;
+  } break;
   case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT: {
     os << (const struct ur_enqueue_events_wait_with_barrier_ext_params_t *)
             params;
diff --git a/unified-runtime/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst b/unified-runtime/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst
new file mode 100644
index 0000000000000..703cd1e935592
--- /dev/null
+++ b/unified-runtime/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst
@@ -0,0 +1,77 @@
+<%
+    OneApi=tags['$OneApi']
+    x=tags['$x']
+    X=x.upper()
+%>
+
+.. _experimental-enqueue-kernel-launch-with-args:
+
+================================================================================
+Enqueue Kernel Launch With Args
+================================================================================
+
+.. warning::
+
+    Experimental features:
+
+    *   May be replaced, updated, or removed at any time.
+    *   Do not require maintaining API/ABI stability of their own additions over
+        time.
+    *   Do not require conformance testing of their own additions.
+
+
+
+Motivation
+--------------------------------------------------------------------------------
+
+If an application is setting a kernel's args and launching that kernel in the
+same place, we can eliminate some overhead by allowing this to be accomplished
+with one API call, rather than requiring one call for each argument and one to
+launch. This also aligns with developments in the Level Zero backend, as well
+as how CUDA and HIP handle kernel args.
+
+API
+--------------------------------------------------------------------------------
+
+Enums
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* ${x}_structure_type_t
+    ${X}_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES
+
+* ${x}_exp_kernel_arg_type_t
+    ${X}_EXP_KERNEL_ARG_TYPE_VALUE
+    ${X}_EXP_KERNEL_ARG_TYPE_POINTER
+    ${X}_EXP_KERNEL_ARG_TYPE_MEM_OBJ
+    ${X}_EXP_KERNEL_ARG_TYPE_LOCAL
+    ${X}_EXP_KERNEL_ARG_TYPE_SAMPLER
+
+Types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* ${x}_exp_kernel_arg_mem_obj_tuple_t
+* ${x}_exp_kernel_arg_value_t
+* ${x}_exp_kernel_arg_properties_t
+
+Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* ${x}EnqueueKernelLaunchWithArgsExp
+
+Changelog
+--------------------------------------------------------------------------------
+
++-----------+---------------------------------------------+
+| Revision  | Changes                                     |
++===========+=============================================+
+| 1.0       | Initial Draft                               |
++-----------+---------------------------------------------+
+
+Support
+--------------------------------------------------------------------------------
+
+Adapters must support this feature. A naive implementation can easily be
+constructed as a wrapper around the existing APIs for setting kernel args and
+launching.
+
+Contributors
+--------------------------------------------------------------------------------
+
+* Aaron Greig `aaron.greig@codeplay.com <aaron.greig@codeplay.com>`_
diff --git a/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml b/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml
new file mode 100644
index 0000000000000..6656b6a6d0299
--- /dev/null
+++ b/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml
@@ -0,0 +1,170 @@
+#
+# Copyright (C) 2025 Intel Corporation
+#
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# See YaML.md for syntax definition
+#
+--- #--------------------------------------------------------------------------
+type: header
+desc: "Intel $OneApi Unified Runtime Experimental API for setting args at kernel launch"
+ordinal: "100"
+--- #--------------------------------------------------------------------------
+type: enum
+desc: "What kind of kernel arg is this"
+class: $xKernel
+name: $x_exp_kernel_arg_type_t
+etors:
+    - name: VALUE
+      desc: "Kernel arg is a value."
+    - name: POINTER
+      desc: "Kernel arg is a pointer."
+    - name: MEM_OBJ
+      desc: "Kernel arg is a memory object."
+    - name: LOCAL
+      desc: "Kernel arg is a local allocation."
+    - name: SAMPLER
+      desc: "Kernel arg is a sampler."
+--- #--------------------------------------------------------------------------
+type: struct
+desc: "Mem obj/properties tuple"
+name: $x_exp_kernel_arg_mem_obj_tuple_t
+members:
+    - type: $x_mem_handle_t
+      name: hMem
+      desc: "[in] Handle of a memory object"
+    - type: $x_mem_flags_t
+      name: flags
+      desc: "[in] Memory flags to associate with `hMem`. Allowed values are: $X_MEM_FLAG_READ_WRITE, $X_MEM_FLAG_WRITE_ONLY, $X_MEM_FLAG_READ_ONLY."
+--- #--------------------------------------------------------------------------
+# We have redundant entries in the union (value + pointer) to make printing work
+# as it relies on the tags and we can currently only have one tag per member.
+type: union
+desc: "Typesafe container for a kernel argument value"
+name: $x_exp_kernel_arg_value_t
+tag: $x_exp_kernel_arg_type_t
+members:
+    - type: "const void*"
+      name: value
+      desc: |
+         [in] argument value represented as matching arg type.
+         The data pointed to will be copied and therefore can be reused on return.
+      tag: $X_EXP_KERNEL_ARG_TYPE_VALUE
+    - type: "const void*"
+      name: pointer
+      desc: "[in] Allocation obtained by USM allocation or virtual memory mapping operation, or pointer to a literal value."
+      tag: $X_EXP_KERNEL_ARG_TYPE_POINTER
+    - type: $x_exp_kernel_arg_mem_obj_tuple_t
+      name: memObjTuple
+      desc: "[in] Struct containing a memory object and associated flags."
+      tag: $X_EXP_KERNEL_ARG_TYPE_MEM_OBJ
+    - type: $x_sampler_handle_t
+      name: sampler
+      desc: "[in] Handle of a sampler object."
+      tag: $X_EXP_KERNEL_ARG_TYPE_SAMPLER
+--- #--------------------------------------------------------------------------
+type: struct
+desc: "Kernel arg properties"
+name: $x_exp_kernel_arg_properties_t
+base: $x_base_properties_t
+members:
+    - type: $x_exp_kernel_arg_type_t
+      name: type
+      desc: "[in] type of the kernel arg"
+    - type: uint32_t
+      name: index
+      desc: "[in] index of the kernel arg"
+    - type: size_t
+      name: size
+      desc: "[in] size of the kernel arg"
+    - type: $x_exp_kernel_arg_value_t
+      name: value
+      desc: "[in][tagged_by(type)] Union containing the argument value."
+--- #--------------------------------------------------------------------------
+type: enum
+extend: true
+desc: "Structure Type experimental enumerations."
+name: $x_structure_type_t
+etors:
+    - name: EXP_KERNEL_ARG_PROPERTIES
+      desc: $x_exp_kernel_arg_properties_t
+      value: "0x5000"
+--- #--------------------------------------------------------------------------
+type: function
+desc: "Enqueue a command to execute a kernel"
+class: $xEnqueue
+name: KernelLaunchWithArgsExp
+ordinal: "0"
+analogue:
+    - "**clEnqueueNDRangeKernel**"
+params:
+    - type: $x_queue_handle_t
+      name: hQueue
+      desc: "[in] handle of the queue object"
+    - type: $x_kernel_handle_t
+      name: hKernel
+      desc: "[in] handle of the kernel object"
+    - type: uint32_t
+      name: workDim
+      desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
+    - type: "const size_t*"
+      name: pGlobalWorkOffset
+      desc: "[in][optional] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
+    - type: "const size_t*"
+      name: pGlobalWorkSize
+      desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
+    - type: "const size_t*"
+      name: pLocalWorkSize
+      desc: |
+            [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function.
+            If nullptr, the runtime implementation will choose the work-group size.
+    - type: uint32_t
+      name: numArgs
+      desc: "[in] Number of entries in pArgs"
+    - type: "const $x_exp_kernel_arg_properties_t*"
+      name: pArgs
+      desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties."
+    - type: uint32_t
+      name: numPropsInLaunchPropList
+      desc: "[in] size of the launch prop list"
+    - type: const $x_kernel_launch_property_t*
+      name: launchPropList
+      desc: "[in][optional][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties"
+    - type: uint32_t
+      name: numEventsInWaitList
+      desc: "[in] size of the event wait list"
+    - type: "const $x_event_handle_t*"
+      name: phEventWaitList
+      desc: |
+            [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution.
+            If nullptr, the numEventsInWaitList must be 0, indicating that no wait event.
+    - type: $x_event_handle_t*
+      name: phEvent
+      desc: |
+            [out][optional][alloc] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array.
+returns:
+    - $X_RESULT_ERROR_INVALID_QUEUE
+    - $X_RESULT_ERROR_INVALID_KERNEL
+    - $X_RESULT_ERROR_INVALID_EVENT
+    - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST:
+        - "`phEventWaitList == NULL && numEventsInWaitList > 0`"
+        - "`phEventWaitList != NULL && numEventsInWaitList == 0`"
+        - "If event objects in phEventWaitList are not valid events."
+    - $X_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS:
+        - "An event in `phEventWaitList` has $X_EVENT_STATUS_ERROR."
+    - $X_RESULT_ERROR_INVALID_WORK_DIMENSION:
+        - "`pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || pGlobalWorkSize[2] == 0`"
+    - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
+        - "`pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 || pLocalWorkSize[2] == 0)`"
+    - $X_RESULT_ERROR_INVALID_VALUE
+    - $X_RESULT_ERROR_INVALID_KERNEL_ARGS
+        - "The kernel argument values have not been specified."
+    - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
+    - $X_RESULT_ERROR_OUT_OF_RESOURCES
+    - $X_RESULT_ERROR_INVALID_OPERATION:
+        - "If any property in `launchPropList` isn't supported by the device."
+    - $X_RESULT_ERROR_INVALID_NULL_POINTER:
+        - "`launchPropList == NULL && numPropsInLaunchPropList > 0`"
+        - "`pArgs == NULL && numArgs > 0`"
diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml
index baefa0ad0903d..580a9a1dae741 100644
--- a/unified-runtime/scripts/core/registry.yml
+++ b/unified-runtime/scripts/core/registry.yml
@@ -670,7 +670,10 @@ etors:
 - name: BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP
   desc: Enumerator for $xBindlessImagesSupportsImportingHandleTypeExp
   value: '288'
-max_id: '288'
+- name: ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP
+  desc: Enumerator for $xEnqueueKernelLaunchWithArgsExp
+  value: '289'
+max_id: '289'
 ---
 type: enum
 desc: Defines structure types
diff --git a/unified-runtime/scripts/parse_specs.py b/unified-runtime/scripts/parse_specs.py
index 9fdb69eedc473..8adbc6de19f4a 100644
--- a/unified-runtime/scripts/parse_specs.py
+++ b/unified-runtime/scripts/parse_specs.py
@@ -909,7 +909,9 @@ def _append(lst, key, val):
                 rets[idx][key].append(val)
 
         def append_nullchecks(param, accessor: str):
-            if type_traits.is_pointer(param["type"]):
+            if type_traits.is_pointer(param["type"]) or type_traits.is_array(
+                param["type"]
+            ):
                 _append(
                     rets,
                     "$X_RESULT_ERROR_INVALID_NULL_POINTER",
diff --git a/unified-runtime/scripts/templates/helper.py b/unified-runtime/scripts/templates/helper.py
index 00de01e34753a..5b4fc0c18c2c5 100644
--- a/unified-runtime/scripts/templates/helper.py
+++ b/unified-runtime/scripts/templates/helper.py
@@ -987,12 +987,17 @@ def make_param_lines(
 
         words = []
         if "type*" in format:
-            words.append(tname + "*")
+            ptname = tname + "*"
+            if type_traits.is_array(item["type"]):
+                ptname += "*"
+            words.append(ptname)
             name = "p" + name
         elif "type" in format:
             words.append(tname)
         if "name" in format:
             words.append(name)
+        if type_traits.is_array(item["type"]) and "type" in format:
+            words.append(f"[{type_traits.get_array_length(item['type'])}]")
 
         prologue = " ".join(words)
         if "delim" in format:
diff --git a/unified-runtime/scripts/templates/print.hpp.mako b/unified-runtime/scripts/templates/print.hpp.mako
index 4481847130cbe..ed94cd9227996 100644
--- a/unified-runtime/scripts/templates/print.hpp.mako
+++ b/unified-runtime/scripts/templates/print.hpp.mako
@@ -99,14 +99,7 @@ def findMemberType(_item):
         ${x}::details::printUnion(os, ${deref}(params${access}${item['name']}), params${access}${th.param_traits.tagged_member(item)});
     %elif th.type_traits.is_array(item['type']):
         os << ".${iname} = {";
-        for(auto i = 0; i < ${th.type_traits.get_array_length(item['type'])}; i++){
-            if(i != 0){
-                os << ", ";
-            }
-            <%call expr="member(iname, itype, True)">
-                ${deref}(params${access}${item['name']}[i])
-            </%call>
-        }
+        ${x}::details::printArray<${th.type_traits.get_array_length(item['type'])}>(os, ${deref}params${access}${pname});
         os << "}";
     %elif typename is not None:
         os << ".${iname} = ";
@@ -140,6 +133,7 @@ inline constexpr bool is_handle_v = is_handle<T>::value;
 template <typename T> inline ${x}_result_t printPtr(std::ostream &os, const T *ptr);
 template <typename T> inline ${x}_result_t printFlag(std::ostream &os, uint32_t flag);
 template <typename T> inline ${x}_result_t printTagged(std::ostream &os, const void *ptr, T value, size_t size);
+template <size_t size, typename T> inline ur_result_t printArray(std::ostream &os, const T *ptr);
 
 %for spec in specs:
 %for obj in spec['objects']:
@@ -564,6 +558,25 @@ template <typename T> inline ${x}_result_t printPtr(std::ostream &os, const T *p
 
     return ${X}_RESULT_SUCCESS;
 }
+
+///////////////////////////////////////////////////////////////////////////////
+// @brief Print array of literals
+template <size_t size, typename T>
+inline ur_result_t printArray(std::ostream &os, const T *ptr) {
+  if(ptr == NULL)  {
+    return printPtr(os, ptr);
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    if (i != 0) {
+      os << ", ";
+    }
+
+    os << ptr[i];
+  }
+
+  return ${X}_RESULT_SUCCESS;
+}
 } // namespace ${x}::details
 
 namespace ${x}::extras {
diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp
index 36f7535be74f5..e4ccb670e0291 100644
--- a/unified-runtime/source/adapters/cuda/enqueue.cpp
+++ b/unified-runtime/source/adapters/cuda/enqueue.cpp
@@ -15,6 +15,7 @@
 #include "kernel.hpp"
 #include "memory.hpp"
 #include "queue.hpp"
+#include "sampler.hpp"
 
 #include <cmath>
 #include <cuda.h>
@@ -632,6 +633,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 #endif // CUDA_VERSION >= 11080
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  try {
+    for (uint32_t i = 0; i < numArgs; i++) {
+      switch (pArgs[i].type) {
+      case UR_EXP_KERNEL_ARG_TYPE_LOCAL: {
+        hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size);
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_VALUE: {
+        hKernel->setKernelArg(pArgs[i].index, pArgs[i].size,
+                              pArgs[i].value.value);
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_POINTER: {
+        // setKernelArg is expecting a pointer to our argument
+        hKernel->setKernelArg(pArgs[i].index, pArgs[i].size,
+                              &pArgs[i].value.pointer);
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+        ur_kernel_arg_mem_obj_properties_t Props = {
+            UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
+            pArgs[i].value.memObjTuple.flags};
+        UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props,
+                                     pArgs[i].value.memObjTuple.hMem));
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+        uint32_t SamplerProps = pArgs[i].value.sampler->Props;
+        hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t),
+                              (void *)&SamplerProps);
+        break;
+      }
+      default:
+        return UR_RESULT_ERROR_INVALID_ENUMERATION;
+      }
+    }
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numPropsInLaunchPropList, launchPropList,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+}
+
 /// Set parameters for general 3D memory copy.
 /// If the source and/or destination is on the device, SrcPtr and/or DstPtr
 /// must be a pointer to a CUdeviceptr
diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
index 8430df0ab0678..a9b072472b7c9 100644
--- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
@@ -454,6 +454,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
   pDdiTable->pfnUSMHostAllocExp = urEnqueueUSMHostAllocExp;
   pDdiTable->pfnUSMFreeExp = urEnqueueUSMFreeExp;
   pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp;
+  pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp
index 2206fbbf3eb3a..89b45d9d29b2a 100644
--- a/unified-runtime/source/adapters/hip/enqueue.cpp
+++ b/unified-runtime/source/adapters/hip/enqueue.cpp
@@ -16,6 +16,7 @@
 #include "logger/ur_logger.hpp"
 #include "memory.hpp"
 #include "queue.hpp"
+#include "sampler.hpp"
 #include "ur_api.h"
 
 #include <ur/ur.hpp>
@@ -340,6 +341,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  try {
+    for (uint32_t i = 0; i < numArgs; i++) {
+      switch (pArgs[i].type) {
+      case UR_EXP_KERNEL_ARG_TYPE_LOCAL: {
+        hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size);
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_VALUE: {
+        hKernel->setKernelArg(pArgs[i].index, pArgs[i].size,
+                              pArgs[i].value.value);
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_POINTER: {
+        // setKernelArg is expecting a pointer to our argument
+        hKernel->setKernelArg(pArgs[i].index, pArgs[i].size,
+                              &pArgs[i].value.pointer);
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+        ur_kernel_arg_mem_obj_properties_t Props = {
+            UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
+            pArgs[i].value.memObjTuple.flags};
+        UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props,
+                                     pArgs[i].value.memObjTuple.hMem));
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+        uint32_t SamplerProps = pArgs[i].value.sampler->Props;
+        hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t),
+                              (void *)&SamplerProps);
+        break;
+      }
+      default:
+        return UR_RESULT_ERROR_INVALID_ENUMERATION;
+      }
+    }
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numPropsInLaunchPropList, launchPropList,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+}
+
 /// Enqueues a wait on the given queue for all events.
 /// See \ref enqueueEventWait
 ///
diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
index dfb4382cad828..d8ec6bb3b50c9 100644
--- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
@@ -447,6 +447,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
   pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
   pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp;
   pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp;
+  pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/level_zero/kernel.cpp b/unified-runtime/source/adapters/level_zero/kernel.cpp
index bcac9cb04c320..1a5d02da15d37 100644
--- a/unified-runtime/source/adapters/level_zero/kernel.cpp
+++ b/unified-runtime/source/adapters/level_zero/kernel.cpp
@@ -56,6 +56,173 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+inline ur_result_t KernelSetArgValueHelper(
+    ur_kernel_handle_t Kernel,
+    /// [in] argument index in range [0, num args - 1]
+    uint32_t ArgIndex,
+    /// [in] size of argument type
+    size_t ArgSize,
+    /// [in] argument value represented as matching arg type.
+    const void *PArgValue) {
+  // OpenCL: "the arg_value pointer can be NULL or point to a NULL value
+  // in which case a NULL value will be used as the value for the argument
+  // declared as a pointer to global or constant memory in the kernel"
+  //
+  // We don't know the type of the argument but it seems that the only time
+  // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument
+  // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL.
+  if (ArgSize == sizeof(void *) && PArgValue &&
+      *(void **)(const_cast<void *>(PArgValue)) == nullptr) {
+    PArgValue = nullptr;
+  }
+
+  if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) {
+    return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX;
+  }
+
+  ze_result_t ZeResult = ZE_RESULT_SUCCESS;
+  if (Kernel->ZeKernelMap.empty()) {
+    auto ZeKernel = Kernel->ZeKernel;
+    ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue,
+                               (ZeKernel, ArgIndex, ArgSize, PArgValue));
+  } else {
+    for (auto It : Kernel->ZeKernelMap) {
+      auto ZeKernel = It.second;
+      ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue,
+                                 (ZeKernel, ArgIndex, ArgSize, PArgValue));
+    }
+  }
+
+  if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) {
+    return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE;
+  }
+
+  return ze2urResult(ZeResult);
+}
+
+inline ur_result_t KernelSetArgMemObjHelper(
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t Kernel,
+    /// [in] argument index in range [0, num args - 1]
+    uint32_t ArgIndex,
+    /// [in][optional] pointer to Memory object properties.
+    const ur_kernel_arg_mem_obj_properties_t *Properties,
+    /// [in][optional] handle of Memory object.
+    ur_mem_handle_t ArgValue) {
+  // The ArgValue may be a NULL pointer in which case a NULL value is used for
+  // the kernel argument declared as a pointer to global or constant memory.
+
+  if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) {
+    return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX;
+  }
+
+  ur_mem_handle_t_ *UrMem = ur_cast<ur_mem_handle_t_ *>(ArgValue);
+
+  ur_mem_handle_t_::access_mode_t UrAccessMode = ur_mem_handle_t_::read_write;
+  if (Properties) {
+    switch (Properties->memoryAccess) {
+    case UR_MEM_FLAG_READ_WRITE:
+      UrAccessMode = ur_mem_handle_t_::read_write;
+      break;
+    case UR_MEM_FLAG_WRITE_ONLY:
+      UrAccessMode = ur_mem_handle_t_::write_only;
+      break;
+    case UR_MEM_FLAG_READ_ONLY:
+      UrAccessMode = ur_mem_handle_t_::read_only;
+      break;
+    case 0:
+      break;
+    default:
+      return UR_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+  }
+  auto Arg = UrMem ? UrMem : nullptr;
+  Kernel->PendingArguments.push_back(
+      {ArgIndex, sizeof(void *), Arg, UrAccessMode});
+
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t Queue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t Kernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *GlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *GlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *LocalWorkSize,
+    /// [in] size of the event wait list
+    uint32_t NumArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *Args,
+    /// [in] size of the launch prop list
+    uint32_t NumPropsInLaunchPropList,
+    /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    /// properties
+    const ur_kernel_launch_property_t *LaunchPropList,
+    uint32_t NumEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution. If
+    /// nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *EventWaitList,
+    /// [in,out][optional] return an event object that identifies this
+    /// particular kernel execution instance.
+    ur_event_handle_t *OutEvent) {
+  {
+    std::scoped_lock<ur_shared_mutex> Guard(Kernel->Mutex);
+    for (uint32_t i = 0; i < NumArgs; i++) {
+      switch (Args[i].type) {
+      case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+        UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size,
+                                        nullptr));
+        break;
+      case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+        UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size,
+                                        Args[i].value.value));
+        break;
+      case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+        UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size,
+                                        &Args[i].value.pointer));
+        break;
+      case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+        ur_kernel_arg_mem_obj_properties_t Properties = {
+            UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
+            Args[i].value.memObjTuple.flags};
+        UR_CALL(KernelSetArgMemObjHelper(Kernel, Args[i].index, &Properties,
+                                         Args[i].value.memObjTuple.hMem));
+        break;
+      }
+      case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+        UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size,
+                                        &Args[i].value.sampler->ZeSampler));
+        break;
+      }
+      default:
+        return UR_RESULT_ERROR_INVALID_ENUMERATION;
+      }
+    }
+  }
+  // Normalize so each dimension has at least one work item
+  return level_zero::urEnqueueKernelLaunch(
+      Queue, Kernel, workDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
+      NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList,
+      EventWaitList, OutEvent);
+}
+
 ur_result_t urEnqueueKernelLaunch(
     /// [in] handle of the queue object
     ur_queue_handle_t Queue,
diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
index 13d7274e7aebf..4276f97f5dd29 100644
--- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
@@ -225,6 +225,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
     return result;
   }
 
+  pDdiTable->pfnKernelLaunchWithArgsExp =
+      ur::level_zero::urEnqueueKernelLaunchWithArgsExp;
   pDdiTable->pfnUSMDeviceAllocExp = ur::level_zero::urEnqueueUSMDeviceAllocExp;
   pDdiTable->pfnUSMSharedAllocExp = ur::level_zero::urEnqueueUSMSharedAllocExp;
   pDdiTable->pfnUSMHostAllocExp = ur::level_zero::urEnqueueUSMHostAllocExp;
diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
index 77bc0b7d5b737..bf1cb230dfc6c 100644
--- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
+++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
@@ -806,6 +806,15 @@ ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice,
                                          ur_exp_peer_info_t propName,
                                          size_t propSize, void *pPropValue,
                                          size_t *pPropSizeRet);
+ur_result_t urEnqueueKernelLaunchWithArgsExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent);
 ur_result_t urEnqueueEventsWaitWithBarrierExt(
     ur_queue_handle_t hQueue,
     const ur_exp_enqueue_ext_properties_t *pProperties,
diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
index 142caaecc1a71..3b4bbc01e2b18 100644
--- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
@@ -11,6 +11,7 @@
 #include "command_list_manager.hpp"
 #include "../helpers/kernel_helpers.hpp"
 #include "../helpers/memory_helpers.hpp"
+#include "../sampler.hpp"
 #include "../ur_interface_loader.hpp"
 #include "command_buffer.hpp"
 #include "context.hpp"
@@ -1037,3 +1038,60 @@ ur_result_t ur_command_list_manager::releaseSubmittedKernels() {
   submittedKernels.clear();
   return UR_RESULT_SUCCESS;
 }
+
+ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp(
+    ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t phEvent) {
+  TRACK_SCOPE_LATENCY(
+      "ur_queue_immediate_in_order_t::enqueueKernelLaunchWithArgsExp");
+  {
+    std::scoped_lock<ur_shared_mutex> guard(hKernel->Mutex);
+    for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) {
+      switch (pArgs[argIndex].type) {
+      case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+        UR_CALL(hKernel->setArgValue(pArgs[argIndex].index,
+                                     pArgs[argIndex].size, nullptr, nullptr));
+        break;
+      case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+        UR_CALL(hKernel->setArgValue(pArgs[argIndex].index,
+                                     pArgs[argIndex].size, nullptr,
+                                     pArgs[argIndex].value.value));
+        break;
+      case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+        UR_CALL(hKernel->setArgPointer(pArgs[argIndex].index, nullptr,
+                                       pArgs[argIndex].value.pointer));
+        break;
+      case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ:
+        // TODO: import helper for converting ur flags to internal equivalent
+        UR_CALL(hKernel->addPendingMemoryAllocation(
+            {pArgs[argIndex].value.memObjTuple.hMem,
+             ur_mem_buffer_t::device_access_mode_t::read_write,
+             pArgs[argIndex].index}));
+        break;
+      case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+        UR_CALL(
+            hKernel->setArgValue(argIndex, sizeof(void *), nullptr,
+                                 &pArgs[argIndex].value.sampler->ZeSampler));
+        break;
+      }
+      default:
+        return UR_RESULT_ERROR_INVALID_ENUMERATION;
+      }
+    }
+  }
+
+  UR_CALL(appendKernelLaunch(hKernel, workDim, pGlobalWorkOffset,
+                             pGlobalWorkSize, pLocalWorkSize,
+                             numPropsInLaunchPropList, launchPropList,
+                             numEventsInWaitList, phEventWaitList, phEvent));
+
+  recordSubmittedKernel(hKernel);
+
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp
index 3c1bbd710ed47..8ace46c88837f 100644
--- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp
@@ -233,6 +233,16 @@ struct ur_command_list_manager {
                                const ur_event_handle_t *phEventWaitList,
                                ur_event_handle_t phEvent);
 
+  ur_result_t appendKernelLaunchWithArgsExp(
+      ur_kernel_handle_t hKernel, uint32_t workDim,
+      const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+      const size_t *pLocalWorkSize, uint32_t numArgs,
+      const ur_exp_kernel_arg_properties_t *pArgs,
+      uint32_t numPropsInLaunchPropList,
+      const ur_kernel_launch_property_t *launchPropList,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t phEvent);
+
 private:
   ur_result_t appendGenericCommandListsExp(
       uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists,
diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
index 8809abe194362..0344323ab4f89 100644
--- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
@@ -436,19 +436,17 @@ ur_result_t urKernelSetArgPointer(
   return exceptionToResult(std::current_exception());
 }
 
-static ur_mem_buffer_t::device_access_mode_t memAccessFromKernelProperties(
-    const ur_kernel_arg_mem_obj_properties_t *pProperties) {
-  if (pProperties) {
-    switch (pProperties->memoryAccess) {
-    case UR_MEM_FLAG_READ_WRITE:
-      return ur_mem_buffer_t::device_access_mode_t::read_write;
-    case UR_MEM_FLAG_WRITE_ONLY:
-      return ur_mem_buffer_t::device_access_mode_t::write_only;
-    case UR_MEM_FLAG_READ_ONLY:
-      return ur_mem_buffer_t::device_access_mode_t::read_only;
-    default:
-      return ur_mem_buffer_t::device_access_mode_t::read_write;
-    }
+static ur_mem_buffer_t::device_access_mode_t
+memAccessFromKernelProperties(const ur_mem_flags_t &Flags) {
+  switch (Flags) {
+  case UR_MEM_FLAG_READ_WRITE:
+    return ur_mem_buffer_t::device_access_mode_t::read_write;
+  case UR_MEM_FLAG_WRITE_ONLY:
+    return ur_mem_buffer_t::device_access_mode_t::write_only;
+  case UR_MEM_FLAG_READ_ONLY:
+    return ur_mem_buffer_t::device_access_mode_t::read_only;
+  default:
+    return ur_mem_buffer_t::device_access_mode_t::read_write;
   }
   return ur_mem_buffer_t::device_access_mode_t::read_write;
 }
@@ -462,7 +460,10 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
   std::scoped_lock<ur_shared_mutex> guard(hKernel->Mutex);
 
   UR_CALL(hKernel->addPendingMemoryAllocation(
-      {hArgValue, memAccessFromKernelProperties(pProperties), argIndex}));
+      {hArgValue,
+       memAccessFromKernelProperties(pProperties ? pProperties->memoryAccess
+                                                 : 0),
+       argIndex}));
 
   return UR_RESULT_SUCCESS;
 } catch (...) {
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp
index 582885ea67c46..29a8bf7114862 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp
@@ -442,6 +442,22 @@ ur_result_t urEnqueueTimestampRecordingExp(
 } catch (...) {
   return exceptionToResult(std::current_exception());
 }
+ur_result_t urEnqueueKernelLaunchWithArgsExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) try {
+  return hQueue->get().enqueueKernelLaunchWithArgsExp(
+      hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+      numArgs, pArgs, numPropsInLaunchPropList, launchPropList,
+      numEventsInWaitList, phEventWaitList, phEvent);
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
 ur_result_t urEnqueueEventsWaitWithBarrierExt(
     ur_queue_handle_t hQueue,
     const ur_exp_enqueue_ext_properties_t *pProperties,
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp
index 4bd9d8fd2141e..530897288f0fa 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp
@@ -163,6 +163,11 @@ struct ur_queue_t_ {
   virtual ur_result_t enqueueTimestampRecordingExp(bool, uint32_t,
                                                    const ur_event_handle_t *,
                                                    ur_event_handle_t *) = 0;
+  virtual ur_result_t enqueueKernelLaunchWithArgsExp(
+      ur_kernel_handle_t, uint32_t, const size_t *, const size_t *,
+      const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *,
+      uint32_t, const ur_kernel_launch_property_t *, uint32_t,
+      const ur_event_handle_t *, ur_event_handle_t *) = 0;
   virtual ur_result_t
   enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *,
                                   uint32_t, const ur_event_handle_t *,
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
index 3f230861ad563..7f40d9139d63a 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -454,6 +454,22 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ {
         createEventIfRequested(eventPool.get(), phEvent, this));
   }
 
+  ur_result_t enqueueKernelLaunchWithArgsExp(
+      ur_kernel_handle_t hKernel, uint32_t workDim,
+      const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+      const size_t *pLocalWorkSize, uint32_t numArgs,
+      const ur_exp_kernel_arg_properties_t *pArgs,
+      uint32_t numPropsInLaunchPropList,
+      const ur_kernel_launch_property_t *launchPropList,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override {
+    return commandListManager.lock()->appendKernelLaunchWithArgsExp(
+        hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+        numArgs, pArgs, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList,
+        createEventIfRequested(eventPool.get(), phEvent, this));
+  }
+
   ur::RefCount RefCount;
 };
 
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp
index f1ad68a62a1a8..564c0472edb81 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp
@@ -506,6 +506,24 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ {
         createEventIfRequested(eventPool.get(), phEvent, this));
   }
 
+  ur_result_t enqueueKernelLaunchWithArgsExp(
+      ur_kernel_handle_t hKernel, uint32_t workDim,
+      const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+      const size_t *pLocalWorkSize, uint32_t numArgs,
+      const ur_exp_kernel_arg_properties_t *pArgs,
+      uint32_t numPropsInLaunchPropList,
+      const ur_kernel_launch_property_t *launchPropList,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override {
+    auto commandListId = getNextCommandListId();
+    return commandListManagers.lock()[commandListId]
+        .appendKernelLaunchWithArgsExp(
+            hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+            pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList,
+            launchPropList, numEventsInWaitList, phEventWaitList,
+            createEventIfRequested(eventPool.get(), phEvent, this));
+  }
+
   ur::RefCount RefCount;
 };
 
diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
index bc1ef8ae6dc86..71778bec2dacc 100644
--- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp
+++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
@@ -11919,6 +11919,107 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
   return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list
+    /// of launch properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution.
+    /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional][alloc] return an event object that identifies this
+    /// particular kernel execution instance. If phEventWaitList and phEvent
+    /// are not NULL, phEvent must not refer to an element of the
+    /// phEventWaitList array.
+    ur_event_handle_t *phEvent) try {
+  ur_result_t result = UR_RESULT_SUCCESS;
+
+  ur_enqueue_kernel_launch_with_args_exp_params_t params = {
+      &hQueue,
+      &hKernel,
+      &workDim,
+      &pGlobalWorkOffset,
+      &pGlobalWorkSize,
+      &pLocalWorkSize,
+      &numArgs,
+      &pArgs,
+      &numPropsInLaunchPropList,
+      &launchPropList,
+      &numEventsInWaitList,
+      &phEventWaitList,
+      &phEvent};
+
+  auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_before_callback(
+          "urEnqueueKernelLaunchWithArgsExp"));
+  if (beforeCallback) {
+    result = beforeCallback(&params);
+    if (result != UR_RESULT_SUCCESS) {
+      return result;
+    }
+  }
+
+  auto replaceCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_replace_callback(
+          "urEnqueueKernelLaunchWithArgsExp"));
+  if (replaceCallback) {
+    result = replaceCallback(&params);
+  } else {
+
+    // optional output handle
+    if (phEvent) {
+      *phEvent = mock::createDummyHandle<ur_event_handle_t>();
+    }
+    result = UR_RESULT_SUCCESS;
+  }
+
+  if (result != UR_RESULT_SUCCESS) {
+    return result;
+  }
+
+  auto afterCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_after_callback(
+          "urEnqueueKernelLaunchWithArgsExp"));
+  if (afterCallback) {
+    return afterCallback(&params);
+  }
+
+  return result;
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt
 __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt(
@@ -12424,6 +12525,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
 
   ur_result_t result = UR_RESULT_SUCCESS;
 
+  pDdiTable->pfnKernelLaunchWithArgsExp =
+      driver::urEnqueueKernelLaunchWithArgsExp;
+
   pDdiTable->pfnUSMDeviceAllocExp = driver::urEnqueueUSMDeviceAllocExp;
 
   pDdiTable->pfnUSMSharedAllocExp = driver::urEnqueueUSMSharedAllocExp;
diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp
index 6818a915b9334..f1e7ea3c31f4d 100644
--- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp
+++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp
@@ -705,3 +705,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp(
     const ur_event_handle_t *, ur_event_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) {
+    switch (pArgs[argIndex].type) {
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+      UR_CALL(hKernel->addArg(pArgs[argIndex].value.value,
+                              pArgs[argIndex].index, pArgs[argIndex].size));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+      UR_CALL(
+          hKernel->addPtrArg(const_cast<void *>(pArgs[argIndex].value.pointer),
+                             pArgs[argIndex].index));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+      auto MemObj = pArgs[argIndex].value.memObjTuple.hMem;
+      UR_CALL(hKernel->addMemObjArg(MemObj, pArgs[argIndex].index));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+      UR_CALL(
+          hKernel->addLocalArg(pArgs[argIndex].index, pArgs[argIndex].size));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER:
+      return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+      break;
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numPropsInLaunchPropList, launchPropList,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+}
diff --git a/unified-runtime/source/adapters/native_cpu/kernel.cpp b/unified-runtime/source/adapters/native_cpu/kernel.cpp
index ac11331357f39..f67c7653d0981 100644
--- a/unified-runtime/source/adapters/native_cpu/kernel.cpp
+++ b/unified-runtime/source/adapters/native_cpu/kernel.cpp
@@ -61,21 +61,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
   // TODO: error checking
 
   UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
-  UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
 
-  hKernel->addArg(pArgValue, argIndex, argSize);
-
-  return UR_RESULT_SUCCESS;
+  return hKernel->addArg(pArgValue, argIndex, argSize);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal(
     ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize,
     const ur_kernel_arg_local_properties_t * /*pProperties*/) {
-  // emplace a placeholder kernel arg, gets replaced with a pointer to the
-  // memory pool before enqueueing the kernel.
-  hKernel->addPtrArg(nullptr, argIndex);
-  hKernel->_localArgInfo.emplace_back(argIndex, argSize);
-  return UR_RESULT_SUCCESS;
+  return hKernel->addLocalArg(argIndex, argSize);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel,
@@ -211,11 +204,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer(
     const void *pArgValue) {
 
   UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
-  UR_ASSERT(pArgValue, UR_RESULT_ERROR_INVALID_NULL_POINTER);
-
-  hKernel->addPtrArg(const_cast<void *>(pArgValue), argIndex);
 
-  return UR_RESULT_SUCCESS;
+  return hKernel->addPtrArg(const_cast<void *>(pArgValue), argIndex);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -242,16 +232,7 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
 
   UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
-  // Taken from ur/adapters/cuda/kernel.cpp
-  // zero-sized buffers are expected to be null.
-  if (hArgValue == nullptr) {
-    hKernel->addPtrArg(nullptr, argIndex);
-    return UR_RESULT_SUCCESS;
-  }
-
-  hKernel->addArgReference(hArgValue);
-  hKernel->addPtrArg(hArgValue->_mem, argIndex);
-  return UR_RESULT_SUCCESS;
+  return hKernel->addMemObjArg(hArgValue, argIndex);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
diff --git a/unified-runtime/source/adapters/native_cpu/kernel.hpp b/unified-runtime/source/adapters/native_cpu/kernel.hpp
index 8daf23feb65f5..285b1c00d0e63 100644
--- a/unified-runtime/source/adapters/native_cpu/kernel.hpp
+++ b/unified-runtime/source/adapters/native_cpu/kernel.hpp
@@ -181,17 +181,44 @@ struct ur_kernel_handle_t_ : RefCounted {
     return Result;
   }
 
-  void addArg(const void *Ptr, size_t Index, size_t Size) {
+  inline ur_result_t addArg(const void *Ptr, size_t Index, size_t Size) {
+    UR_ASSERT(Size, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
     Args.addArg(Index, Size, Ptr);
+    return UR_RESULT_SUCCESS;
   }
 
-  void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); }
+  inline ur_result_t addPtrArg(void *Ptr, size_t Index) {
+    UR_ASSERT(Ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+    Args.addPtrArg(Index, Ptr);
+    return UR_RESULT_SUCCESS;
+  }
 
   void addArgReference(ur_mem_handle_t Arg) {
     Arg->incrementReferenceCount();
     ReferencedArgs.push_back(Arg);
   }
 
+  inline ur_result_t addMemObjArg(ur_mem_handle_t ArgValue, size_t Index) {
+    // Taken from ur/adapters/cuda/kernel.cpp
+    // zero-sized buffers are expected to be null.
+    if (ArgValue == nullptr) {
+      addPtrArg(nullptr, Index);
+      return UR_RESULT_SUCCESS;
+    }
+
+    addArgReference(ArgValue);
+    addPtrArg(ArgValue->_mem, Index);
+    return UR_RESULT_SUCCESS;
+  }
+
+  inline ur_result_t addLocalArg(size_t Index, size_t Size) {
+    // emplace a placeholder kernel arg, gets replaced with a pointer to the
+    // memory pool before enqueueing the kernel.
+    Args.addPtrArg(Index, nullptr);
+    _localArgInfo.emplace_back(Index, Size);
+    return UR_RESULT_SUCCESS;
+  }
+
 private:
   void removeArgReferences() {
     for (auto arg : ReferencedArgs)
diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
index 3f6fe061b4917..7bc2f999a00cd 100644
--- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
@@ -431,6 +431,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
   pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
   pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp;
   pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp;
+  pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp
index 3d419a30dc500..39f1696e9ea22 100644
--- a/unified-runtime/source/adapters/offload/enqueue.cpp
+++ b/unified-runtime/source/adapters/offload/enqueue.cpp
@@ -481,3 +481,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
 
   return UR_RESULT_SUCCESS;
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  for (uint32_t i = 0; i < numArgs; i++) {
+    switch (pArgs[i].type) {
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+      hKernel->Args.addArg(pArgs[i].index, sizeof(pArgs[i].value.pointer),
+                           &pArgs[i].value.pointer);
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+      hKernel->Args.addArg(pArgs[i].index, pArgs[i].size, pArgs[i].value.value);
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ:
+      hKernel->Args.addMemObjArg(pArgs[i].index,
+                                 pArgs[i].value.memObjTuple.hMem,
+                                 pArgs[i].value.memObjTuple.flags);
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER:
+      return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numPropsInLaunchPropList, launchPropList,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+}
diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp
index 1b81636729af7..91d547f8f3112 100644
--- a/unified-runtime/source/adapters/offload/kernel.cpp
+++ b/unified-runtime/source/adapters/offload/kernel.cpp
@@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "kernel.hpp"
-#include "memory.hpp"
 #include "program.hpp"
 #include "ur2offload.hpp"
 #include <OffloadAPI.h>
@@ -102,19 +101,11 @@ UR_APIEXPORT ur_result_t UR_APICALL
 urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
                      const ur_kernel_arg_mem_obj_properties_t *Properties,
                      ur_mem_handle_t hArgValue) {
-  // Handle zero-sized buffers
-  if (hArgValue == nullptr) {
-    hKernel->Args.addArg(argIndex, 0, nullptr);
-    return UR_RESULT_SUCCESS;
-  }
-
   ur_mem_flags_t MemAccess =
       Properties ? Properties->memoryAccess
                  : static_cast<ur_mem_flags_t>(UR_MEM_FLAG_READ_WRITE);
   hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess);
 
-  auto Ptr = std::get<BufferMem>(hArgValue->Mem).Ptr;
-  hKernel->Args.addArg(argIndex, sizeof(void *), &Ptr);
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp
index 2794fc8f32213..88fc6d6591430 100644
--- a/unified-runtime/source/adapters/offload/kernel.hpp
+++ b/unified-runtime/source/adapters/offload/kernel.hpp
@@ -18,6 +18,7 @@
 #include <vector>
 
 #include "common.hpp"
+#include "memory.hpp"
 
 struct ur_kernel_handle_t_ : RefCounted {
 
@@ -56,7 +57,12 @@ struct ur_kernel_handle_t_ : RefCounted {
     }
 
     void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
-      assert(hMem && "Invalid mem handle");
+      // Handle zero-sized buffers
+      if (hMem == nullptr) {
+        addArg(Index, 0, nullptr);
+        return;
+      }
+
       // If a memobj is already set at this index, update the entry rather
       // than adding a duplicate one
       for (auto &Arg : MemObjArgs) {
@@ -66,6 +72,9 @@ struct ur_kernel_handle_t_ : RefCounted {
         }
       }
       MemObjArgs.push_back(MemObjArg{hMem, Index, Flags});
+
+      auto Ptr = std::get<BufferMem>(hMem->Mem).Ptr;
+      addArg(Index, sizeof(void *), &Ptr);
     }
 
     const args_ptr_t &getPointers() const noexcept { return Pointers; }
diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
index fa3adfdaf92bd..0aa0bf4644a13 100644
--- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
@@ -384,6 +384,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
 
   pDdiTable->pfnTimestampRecordingExp = nullptr;
   pDdiTable->pfnNativeCommandExp = nullptr;
+  pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/opencl/enqueue.cpp b/unified-runtime/source/adapters/opencl/enqueue.cpp
index 63b7b45426632..1bd75b6b56aaf 100644
--- a/unified-runtime/source/adapters/opencl/enqueue.cpp
+++ b/unified-runtime/source/adapters/opencl/enqueue.cpp
@@ -16,6 +16,10 @@
 #include "memory.hpp"
 #include "program.hpp"
 #include "queue.hpp"
+#include "sampler.hpp"
+
+#include <algorithm>
+#include <array>
 
 cl_map_flags convertURMapFlagsToCL(ur_map_flags_t URFlags) {
   cl_map_flags CLFlags = 0;
@@ -501,3 +505,102 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
 
   return UR_RESULT_SUCCESS;
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numArgs,
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    uint32_t numPropsInLaunchPropList,
+    const ur_kernel_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList;
+       propIndex++) {
+    // Adapters that don't support cooperative kernels are currently expected
+    // to ignore COOPERATIVE launch properties. Ideally we should avoid passing
+    // these at the SYCL RT level instead, see
+    // https://github.com/intel/llvm/issues/18421
+    if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE ||
+        launchPropList[propIndex].id ==
+            UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) {
+      continue;
+    }
+    return UR_RESULT_ERROR_INVALID_OPERATION;
+  }
+
+  clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr;
+  UR_RETURN_ON_FAILURE(
+      cl_ext::getExtFuncFromContext<clSetKernelArgMemPointerINTEL_fn>(
+          hQueue->Context->CLContext,
+          ur::cl::getAdapter()->fnCache.clSetKernelArgMemPointerINTELCache,
+          cl_ext::SetKernelArgMemPointerName, &SetKernelArgMemPointerPtr));
+
+  for (uint32_t i = 0; i < numArgs; i++) {
+    switch (pArgs[i].type) {
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+      CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel,
+                                          static_cast<cl_uint>(pArgs[i].index),
+                                          pArgs[i].size, nullptr));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+      CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel,
+                                          static_cast<cl_uint>(pArgs[i].index),
+                                          pArgs[i].size, pArgs[i].value.value));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+      cl_mem mem = pArgs[i].value.memObjTuple.hMem
+                       ? pArgs[i].value.memObjTuple.hMem->CLMemory
+                       : nullptr;
+      CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel,
+                                          static_cast<cl_uint>(pArgs[i].index),
+                                          pArgs[i].size, &mem));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+      CL_RETURN_ON_FAILURE(SetKernelArgMemPointerPtr(
+          hKernel->CLKernel, static_cast<cl_uint>(pArgs[i].index),
+          pArgs[i].value.pointer));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+      CL_RETURN_ON_FAILURE(clSetKernelArg(
+          hKernel->CLKernel, static_cast<cl_uint>(pArgs[i].index),
+          pArgs[i].size, &pArgs[i].value.sampler->CLSampler));
+      break;
+    }
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+
+  std::vector<size_t> compiledLocalWorksize;
+  if (!pLocalWorkSize) {
+    cl_device_id device = nullptr;
+    CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(
+        hQueue->CLQueue, CL_QUEUE_DEVICE, sizeof(device), &device, nullptr));
+    // This query always returns size_t[3], if nothing was specified it
+    // returns all zeroes.
+    size_t queriedLocalWorkSize[3] = {0, 0, 0};
+    CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo(
+        hKernel->CLKernel, device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
+        sizeof(size_t[3]), queriedLocalWorkSize, nullptr));
+    if (queriedLocalWorkSize[0] != 0) {
+      for (uint32_t i = 0; i < 3; i++) {
+        compiledLocalWorksize.push_back(queriedLocalWorkSize[i]);
+      }
+    }
+  }
+
+  cl_event Event;
+  std::vector<cl_event> CLWaitEvents(numEventsInWaitList);
+  MapUREventsToCL(numEventsInWaitList, phEventWaitList, CLWaitEvents);
+  CL_RETURN_ON_FAILURE(clEnqueueNDRangeKernel(
+      hQueue->CLQueue, hKernel->CLKernel, workDim, pGlobalWorkOffset,
+      pGlobalWorkSize,
+      compiledLocalWorksize.empty() ? pLocalWorkSize
+                                    : compiledLocalWorksize.data(),
+      numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event)));
+
+  UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent));
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
index c619fa36b1ab0..18cc6a79651be 100644
--- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
@@ -434,6 +434,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
   pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
   pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp;
   pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp;
+  pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/common/stype_map_helpers.def b/unified-runtime/source/common/stype_map_helpers.def
index 79705826395b9..efd69e6ae4cb3 100644
--- a/unified-runtime/source/common/stype_map_helpers.def
+++ b/unified-runtime/source/common/stype_map_helpers.def
@@ -168,3 +168,6 @@ struct stype_map<ur_exp_enqueue_native_command_properties_t>
 template <>
 struct stype_map<ur_exp_enqueue_ext_properties_t>
     : stype_map_impl<UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES> {};
+template <>
+struct stype_map<ur_exp_kernel_arg_properties_t>
+    : stype_map_impl<UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES> {};
diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
index 012aa1422cfa1..7b8ebbd7ff36d 100644
--- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
@@ -1646,6 +1646,119 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo(
   return pfnGetInfo(hDevice, propName, propSize, pPropValue, pPropSizeRet);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunch
+ur_result_t urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    /// properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution. If
+    /// nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional] return an event object that identifies this
+    /// particular kernel execution instance.
+    ur_event_handle_t *phEvent) {
+  // This mutex is to prevent concurrent kernel launches across different queues
+  // as the DeviceASAN local/private shadow memory does not support concurrent
+  // kernel launches now.
+  std::scoped_lock<ur_shared_mutex> Guard(
+      getAsanInterceptor()->KernelLaunchMutex);
+
+  UR_LOG_L(getContext()->logger, DEBUG,
+           "==== urEnqueueKernelLaunchWithArgsExp");
+
+  // We need to set all the args now rather than letting LaunchWithArgs handle
+  // them. This is because some implementations of
+  // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely
+  // on all the args being set.
+  for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) {
+    switch (pArgs[ArgPropIndex].type) {
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL: {
+      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgLocal(
+          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size,
+          nullptr));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER: {
+      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgPointer(
+          hKernel, pArgs[ArgPropIndex].index, nullptr,
+          pArgs[ArgPropIndex].value.pointer));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE: {
+      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgValue(
+          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr,
+          pArgs[ArgPropIndex].value.value));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+      ur_kernel_arg_mem_obj_properties_t Properties = {
+          UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
+          pArgs[ArgPropIndex].value.memObjTuple.flags};
+      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgMemObj(
+          hKernel, pArgs[ArgPropIndex].index, &Properties,
+          pArgs[ArgPropIndex].value.memObjTuple.hMem));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+      auto pfnKernelSetArgSampler =
+          getContext()->urDdiTable.Kernel.pfnSetArgSampler;
+      UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index,
+                                     nullptr,
+                                     pArgs[ArgPropIndex].value.sampler));
+      break;
+    }
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+
+  LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize,
+                        pLocalWorkSize, pGlobalWorkOffset, 3);
+  UR_CALL(LaunchInfo.Data.syncToDevice(hQueue));
+
+  UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo));
+
+  UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList,
+      launchPropList, numEventsInWaitList, phEventWaitList, phEvent));
+
+  UR_CALL(getAsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo));
+
+  return UR_RESULT_SUCCESS;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Exported function for filling application's Adapter table
 ///        with current process' addresses
@@ -1961,6 +2074,22 @@ __urdlllocal ur_result_t UR_APICALL urGetDeviceProcAddrTable(
 
   return result;
 }
+/// @brief Exported function for filling application's ProgramExp table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetEnqueueExpProcAddrTable(
+    /// [in,out] pointer to table of DDI function pointers
+    ur_enqueue_exp_dditable_t *pDdiTable) {
+  ur_result_t result = UR_RESULT_SUCCESS;
+
+  pDdiTable->pfnKernelLaunchWithArgsExp =
+      ur_sanitizer_layer::asan::urEnqueueKernelLaunchWithArgsExp;
+
+  return result;
+}
 
 template <class A, class B> struct NotSupportedApi;
 
@@ -2157,6 +2286,11 @@ ur_result_t initAsanDDITable(ur_dditable_t *dditable) {
         UR_API_VERSION_CURRENT, &dditable->VirtualMem);
   }
 
+  if (UR_RESULT_SUCCESS == result) {
+    result = ur_sanitizer_layer::asan::urGetEnqueueExpProcAddrTable(
+        &dditable->EnqueueExp);
+  }
+
   if (result != UR_RESULT_SUCCESS) {
     UR_LOG_L(getContext()->logger, ERR, "Initialize ASAN DDI table failed: {}",
              result);
diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp
index 810fd76120b2d..f9625b3b1d9e7 100644
--- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp
@@ -1811,6 +1811,122 @@ ur_result_t urEnqueueUSMMemcpy2D(
   return UR_RESULT_SUCCESS;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunch
+ur_result_t urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    /// properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution. If
+    /// nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional] return an event object that identifies this
+    /// particular kernel execution instance.
+    ur_event_handle_t *phEvent) {
+  // This mutex is to prevent concurrent kernel launches across different queues
+  // as the DeviceMSAN local/private shadow memory does not support concurrent
+  // kernel launches now.
+  std::scoped_lock<ur_shared_mutex> Guard(
+      getMsanInterceptor()->KernelLaunchMutex);
+
+  UR_LOG_L(getContext()->logger, DEBUG,
+           "==== urEnqueueKernelLaunchWithArgsExp");
+
+  // We need to set all the args now rather than letting LaunchWithArgs handle
+  // them. This is because some implementations of
+  // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely
+  // on all the args being set.
+  for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) {
+    switch (pArgs[ArgPropIndex].type) {
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL: {
+      UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgLocal(
+          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size,
+          nullptr));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER: {
+      auto pfnKernelSetArgPointer =
+          getContext()->urDdiTable.Kernel.pfnSetArgPointer;
+      UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index,
+                                     nullptr,
+                                     pArgs[ArgPropIndex].value.pointer));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE: {
+      UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgValue(
+          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr,
+          pArgs[ArgPropIndex].value.value));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+      ur_kernel_arg_mem_obj_properties_t Properties = {
+          UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
+          pArgs[ArgPropIndex].value.memObjTuple.flags};
+      UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgMemObj(
+          hKernel, pArgs[ArgPropIndex].index, &Properties,
+          pArgs[ArgPropIndex].value.memObjTuple.hMem));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+      auto pfnKernelSetArgSampler =
+          getContext()->urDdiTable.Kernel.pfnSetArgSampler;
+      UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index,
+                                     nullptr,
+                                     pArgs[ArgPropIndex].value.sampler));
+      break;
+    }
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+
+  USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue),
+                           pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset,
+                           3);
+  UR_CALL(LaunchInfo.initialize());
+
+  UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo));
+
+  UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList,
+      launchPropList, numEventsInWaitList, phEventWaitList, phEvent));
+
+  UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo));
+
+  return UR_RESULT_SUCCESS;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Exported function for filling application's Adapter table
 ///        with current process' addresses
@@ -1989,6 +2105,22 @@ ur_result_t urGetUSMProcAddrTable(
 
   return result;
 }
+/// @brief Exported function for filling application's ProgramExp table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetEnqueueExpProcAddrTable(
+    /// [in,out] pointer to table of DDI function pointers
+    ur_enqueue_exp_dditable_t *pDdiTable) {
+  ur_result_t result = UR_RESULT_SUCCESS;
+
+  pDdiTable->pfnKernelLaunchWithArgsExp =
+      ur_sanitizer_layer::msan::urEnqueueKernelLaunchWithArgsExp;
+
+  return result;
+}
 
 ur_result_t urCheckVersion(ur_api_version_t version) {
   if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) !=
@@ -2054,6 +2186,11 @@ ur_result_t initMsanDDITable(ur_dditable_t *dditable) {
     result = ur_sanitizer_layer::msan::urGetUSMProcAddrTable(&dditable->USM);
   }
 
+  if (UR_RESULT_SUCCESS == result) {
+    result = ur_sanitizer_layer::msan::urGetEnqueueExpProcAddrTable(
+        &dditable->EnqueueExp);
+  }
+
   if (result != UR_RESULT_SUCCESS) {
     UR_LOG_L(getContext()->logger, ERR, "Initialize MSAN DDI table failed: {}",
              result);
diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp
index 61849ac0b363a..7934dbf596e40 100644
--- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp
@@ -1337,6 +1337,120 @@ ur_result_t urEnqueueKernelLaunch(
   return UR_RESULT_SUCCESS;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunch
+ur_result_t urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    /// properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution. If
+    /// nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional] return an event object that identifies this
+    /// particular kernel execution instance.
+    ur_event_handle_t *phEvent) {
+  // This mutex is to prevent concurrent kernel launches across different queues
+  // as the DeviceTSAN local shadow memory does not support concurrent
+  // kernel launches now.
+  std::scoped_lock<ur_shared_mutex> Guard(
+      getTsanInterceptor()->KernelLaunchMutex);
+
+  UR_LOG_L(getContext()->logger, DEBUG,
+           "==== urEnqueueKernelLaunchWithArgsExp");
+
+  // We need to set all the args now rather than letting LaunchWithArgs handle
+  // them. This is because some implementations of
+  // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely
+  // on all the args being set.
+  for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) {
+    switch (pArgs[ArgPropIndex].type) {
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL: {
+      UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgLocal(
+          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size,
+          nullptr));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER: {
+      auto pfnKernelSetArgPointer =
+          getContext()->urDdiTable.Kernel.pfnSetArgPointer;
+      UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index,
+                                     nullptr,
+                                     pArgs[ArgPropIndex].value.pointer));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE: {
+      UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgValue(
+          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr,
+          pArgs[ArgPropIndex].value.value));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+      ur_kernel_arg_mem_obj_properties_t Properties = {
+          UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
+          pArgs[ArgPropIndex].value.memObjTuple.flags};
+      UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgMemObj(
+          hKernel, pArgs[ArgPropIndex].index, &Properties,
+          pArgs[ArgPropIndex].value.memObjTuple.hMem));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+      auto pfnKernelSetArgSampler =
+          getContext()->urDdiTable.Kernel.pfnSetArgSampler;
+      UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index,
+                                     nullptr,
+                                     pArgs[ArgPropIndex].value.sampler));
+      break;
+    }
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+
+  LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize,
+                        pLocalWorkSize, pGlobalWorkOffset, 3);
+
+  UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo));
+
+  UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pLocalWorkSize, 0, nullptr, numPropsInLaunchPropList, launchPropList,
+      numEventsInWaitList, phEventWaitList, phEvent));
+
+  UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo));
+
+  return UR_RESULT_SUCCESS;
+}
+
 ur_result_t urCheckVersion(ur_api_version_t version) {
   if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) !=
           UR_MAJOR_VERSION(version) ||
@@ -1547,6 +1661,22 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
   return UR_RESULT_SUCCESS;
 }
 
+/// @brief Exported function for filling application's ProgramExp table
+///        with current process' addresses
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+ur_result_t urGetEnqueueExpProcAddrTable(
+    /// [in,out] pointer to table of DDI function pointers
+    ur_enqueue_exp_dditable_t *pDdiTable) {
+  ur_result_t result = UR_RESULT_SUCCESS;
+
+  pDdiTable->pfnKernelLaunchWithArgsExp =
+      ur_sanitizer_layer::tsan::urEnqueueKernelLaunchWithArgsExp;
+
+  return result;
+}
 } // namespace tsan
 
 ur_result_t initTsanDDITable(ur_dditable_t *dditable) {
@@ -1596,6 +1726,11 @@ ur_result_t initTsanDDITable(ur_dditable_t *dditable) {
         ur_sanitizer_layer::tsan::urGetEnqueueProcAddrTable(&dditable->Enqueue);
   }
 
+  if (UR_RESULT_SUCCESS == result) {
+    result = ur_sanitizer_layer::tsan::urGetEnqueueExpProcAddrTable(
+        &dditable->EnqueueExp);
+  }
+
   if (result != UR_RESULT_SUCCESS) {
     UR_LOG_L(getContext()->logger, ERR, "Initialize TSAN DDI table failed: {}",
              result);
diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
index 80993d811a397..31e034e258239 100644
--- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
@@ -10097,6 +10097,98 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
   return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list
+    /// of launch properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution.
+    /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional][alloc] return an event object that identifies this
+    /// particular kernel execution instance. If phEventWaitList and phEvent
+    /// are not NULL, phEvent must not refer to an element of the
+    /// phEventWaitList array.
+    ur_event_handle_t *phEvent) {
+  auto pfnKernelLaunchWithArgsExp =
+      getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp;
+
+  if (nullptr == pfnKernelLaunchWithArgsExp)
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+
+  ur_enqueue_kernel_launch_with_args_exp_params_t params = {
+      &hQueue,
+      &hKernel,
+      &workDim,
+      &pGlobalWorkOffset,
+      &pGlobalWorkSize,
+      &pLocalWorkSize,
+      &numArgs,
+      &pArgs,
+      &numPropsInLaunchPropList,
+      &launchPropList,
+      &numEventsInWaitList,
+      &phEventWaitList,
+      &phEvent};
+  uint64_t instance = getContext()->notify_begin(
+      UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP,
+      "urEnqueueKernelLaunchWithArgsExp", &params);
+
+  auto &logger = getContext()->logger;
+  UR_LOG_L(logger, INFO, "   ---> urEnqueueKernelLaunchWithArgsExp\n");
+
+  ur_result_t result = pfnKernelLaunchWithArgsExp(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList,
+      numEventsInWaitList, phEventWaitList, phEvent);
+
+  getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP,
+                           "urEnqueueKernelLaunchWithArgsExp", &params, &result,
+                           instance);
+
+  if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) {
+    std::ostringstream args_str;
+    ur::extras::printFunctionParams(
+        args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, &params);
+    UR_LOG_L(logger, INFO,
+             "   <--- urEnqueueKernelLaunchWithArgsExp({}) -> {};\n",
+             args_str.str(), result);
+  }
+
+  return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt
 __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt(
@@ -10707,6 +10799,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
 
   ur_result_t result = UR_RESULT_SUCCESS;
 
+  dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp;
+  pDdiTable->pfnKernelLaunchWithArgsExp =
+      ur_tracing_layer::urEnqueueKernelLaunchWithArgsExp;
+
   dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp;
   pDdiTable->pfnUSMDeviceAllocExp =
       ur_tracing_layer::urEnqueueUSMDeviceAllocExp;
diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
index 6b6afc878eead..8b1dc6c64e246 100644
--- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
+++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
@@ -10870,6 +10870,122 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
   return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list
+    /// of launch properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution.
+    /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional][alloc] return an event object that identifies this
+    /// particular kernel execution instance. If phEventWaitList and phEvent
+    /// are not NULL, phEvent must not refer to an element of the
+    /// phEventWaitList array.
+    ur_event_handle_t *phEvent) {
+  auto pfnKernelLaunchWithArgsExp =
+      getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp;
+
+  if (nullptr == pfnKernelLaunchWithArgsExp) {
+    return UR_RESULT_ERROR_UNINITIALIZED;
+  }
+
+  if (getContext()->enableParameterValidation) {
+    if (NULL == pGlobalWorkSize)
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    if (launchPropList == NULL && numPropsInLaunchPropList > 0)
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    if (pArgs == NULL && numArgs > 0)
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    if (NULL == hQueue)
+      return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+
+    if (NULL == hKernel)
+      return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+
+    if (NULL != pArgs && UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type)
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+
+    if (phEventWaitList == NULL && numEventsInWaitList > 0)
+      return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST;
+
+    if (phEventWaitList != NULL && numEventsInWaitList == 0)
+      return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST;
+
+    if (pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 ||
+        pGlobalWorkSize[2] == 0)
+      return UR_RESULT_ERROR_INVALID_WORK_DIMENSION;
+
+    if (pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 ||
+                           pLocalWorkSize[2] == 0))
+      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+
+    if (phEventWaitList != NULL && numEventsInWaitList > 0) {
+      for (uint32_t i = 0; i < numEventsInWaitList; ++i) {
+        if (phEventWaitList[i] == NULL) {
+          return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST;
+        }
+      }
+    }
+  }
+
+  if (getContext()->enableLifetimeValidation &&
+      !getContext()->refCountContext->isReferenceValid(hQueue)) {
+    URLOG_CTX_INVALID_REFERENCE(hQueue);
+  }
+
+  if (getContext()->enableLifetimeValidation &&
+      !getContext()->refCountContext->isReferenceValid(hKernel)) {
+    URLOG_CTX_INVALID_REFERENCE(hKernel);
+  }
+
+  ur_result_t result = pfnKernelLaunchWithArgsExp(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList,
+      numEventsInWaitList, phEventWaitList, phEvent);
+
+  if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS &&
+      phEvent) {
+    getContext()->refCountContext->createRefCount(*phEvent);
+  }
+
+  return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt
 __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt(
@@ -11505,6 +11621,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
 
   ur_result_t result = UR_RESULT_SUCCESS;
 
+  dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp;
+  pDdiTable->pfnKernelLaunchWithArgsExp =
+      ur_validation_layer::urEnqueueKernelLaunchWithArgsExp;
+
   dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp;
   pDdiTable->pfnUSMDeviceAllocExp =
       ur_validation_layer::urEnqueueUSMDeviceAllocExp;
diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in
index e86a6c65a7957..d337292275873 100644
--- a/unified-runtime/source/loader/loader.def.in
+++ b/unified-runtime/source/loader/loader.def.in
@@ -76,6 +76,7 @@ EXPORTS
 	urEnqueueEventsWaitWithBarrier
 	urEnqueueEventsWaitWithBarrierExt
 	urEnqueueKernelLaunch
+	urEnqueueKernelLaunchWithArgsExp
 	urEnqueueMemBufferCopy
 	urEnqueueMemBufferCopyRect
 	urEnqueueMemBufferFill
@@ -288,6 +289,7 @@ EXPORTS
 	urPrintEnqueueEventsWaitWithBarrierExtParams
 	urPrintEnqueueEventsWaitWithBarrierParams
 	urPrintEnqueueKernelLaunchParams
+	urPrintEnqueueKernelLaunchWithArgsExpParams
 	urPrintEnqueueMemBufferCopyParams
 	urPrintEnqueueMemBufferCopyRectParams
 	urPrintEnqueueMemBufferFillParams
@@ -349,6 +351,9 @@ EXPORTS
 	urPrintExpImageCopyInputTypes
 	urPrintExpImageCopyRegion
 	urPrintExpImageMemType
+	urPrintExpKernelArgMemObjTuple
+	urPrintExpKernelArgProperties
+	urPrintExpKernelArgType
 	urPrintExpPeerInfo
 	urPrintExpSamplerAddrModes
 	urPrintExpSamplerCubemapFilterMode
diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in
index 6a30c9186f674..023032aecea01 100644
--- a/unified-runtime/source/loader/loader.map.in
+++ b/unified-runtime/source/loader/loader.map.in
@@ -76,6 +76,7 @@
 		urEnqueueEventsWaitWithBarrier;
 		urEnqueueEventsWaitWithBarrierExt;
 		urEnqueueKernelLaunch;
+		urEnqueueKernelLaunchWithArgsExp;
 		urEnqueueMemBufferCopy;
 		urEnqueueMemBufferCopyRect;
 		urEnqueueMemBufferFill;
@@ -288,6 +289,7 @@
 		urPrintEnqueueEventsWaitWithBarrierExtParams;
 		urPrintEnqueueEventsWaitWithBarrierParams;
 		urPrintEnqueueKernelLaunchParams;
+		urPrintEnqueueKernelLaunchWithArgsExpParams;
 		urPrintEnqueueMemBufferCopyParams;
 		urPrintEnqueueMemBufferCopyRectParams;
 		urPrintEnqueueMemBufferFillParams;
@@ -349,6 +351,9 @@
 		urPrintExpImageCopyInputTypes;
 		urPrintExpImageCopyRegion;
 		urPrintExpImageMemType;
+		urPrintExpKernelArgMemObjTuple;
+		urPrintExpKernelArgProperties;
+		urPrintExpKernelArgType;
 		urPrintExpPeerInfo;
 		urPrintExpSamplerAddrModes;
 		urPrintExpSamplerCubemapFilterMode;
diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp
index 80ff806b0e116..e0779fa823d49 100644
--- a/unified-runtime/source/loader/ur_ldrddi.cpp
+++ b/unified-runtime/source/loader/ur_ldrddi.cpp
@@ -5737,6 +5737,65 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
                                  pPropValue, pPropSizeRet);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list
+    /// of launch properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution.
+    /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional][alloc] return an event object that identifies this
+    /// particular kernel execution instance. If phEventWaitList and phEvent
+    /// are not NULL, phEvent must not refer to an element of the
+    /// phEventWaitList array.
+    ur_event_handle_t *phEvent) {
+
+  auto *dditable = *reinterpret_cast<ur_dditable_t **>(hQueue);
+
+  auto *pfnKernelLaunchWithArgsExp =
+      dditable->EnqueueExp.pfnKernelLaunchWithArgsExp;
+  if (nullptr == pfnKernelLaunchWithArgsExp)
+    return UR_RESULT_ERROR_UNINITIALIZED;
+
+  // forward to device-platform
+  return pfnKernelLaunchWithArgsExp(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList,
+      numEventsInWaitList, phEventWaitList, phEvent);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt
 __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt(
@@ -6247,6 +6306,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
     if (ur_loader::getContext()->platforms.size() != 1 ||
         ur_loader::getContext()->forceIntercept) {
       // return pointers to loader's DDIs
+      pDdiTable->pfnKernelLaunchWithArgsExp =
+          ur_loader::urEnqueueKernelLaunchWithArgsExp;
       pDdiTable->pfnUSMDeviceAllocExp = ur_loader::urEnqueueUSMDeviceAllocExp;
       pDdiTable->pfnUSMSharedAllocExp = ur_loader::urEnqueueUSMSharedAllocExp;
       pDdiTable->pfnUSMHostAllocExp = ur_loader::urEnqueueUSMHostAllocExp;
diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp
index 85a085c8fb6bf..647f1607735f7 100644
--- a/unified-runtime/source/loader/ur_libapi.cpp
+++ b/unified-runtime/source/loader/ur_libapi.cpp
@@ -10580,6 +10580,104 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
   return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a command to execute a kernel
+///
+/// @remarks
+///   _Analogues_
+///     - **clEnqueueNDRangeKernel**
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hQueue`
+///         + `NULL == hKernel`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkSize`
+///         + `launchPropList == NULL && numPropsInLaunchPropList > 0`
+///         + `pArgs == NULL && numArgs > 0`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL
+///     - ::UR_RESULT_ERROR_INVALID_EVENT
+///     - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST
+///         + `phEventWaitList == NULL && numEventsInWaitList > 0`
+///         + `phEventWaitList != NULL && numEventsInWaitList == 0`
+///         + If event objects in phEventWaitList are not valid events.
+///     - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS
+///         + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR.
+///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
+///         + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 ||
+///         pGlobalWorkSize[2] == 0`
+///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
+///         + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] ==
+///         0 || pLocalWorkSize[2] == 0)`
+///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values
+///     have not been specified."
+///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
+///     - ::UR_RESULT_ERROR_INVALID_OPERATION
+///         + If any property in `launchPropList` isn't supported by the device.
+ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list
+    /// of launch properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution.
+    /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional][alloc] return an event object that identifies this
+    /// particular kernel execution instance. If phEventWaitList and phEvent
+    /// are not NULL, phEvent must not refer to an element of the
+    /// phEventWaitList array.
+    ur_event_handle_t *phEvent) try {
+  auto pfnKernelLaunchWithArgsExp =
+      ur_lib::getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp;
+  if (nullptr == pfnKernelLaunchWithArgsExp)
+    return UR_RESULT_ERROR_UNINITIALIZED;
+
+  return pfnKernelLaunchWithArgsExp(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList,
+      numEventsInWaitList, phEventWaitList, phEvent);
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Enqueue a barrier command which waits a list of events to complete
 ///        before it completes, with optional extended properties
diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp
index 06619c8f7f625..81e41cb849688 100644
--- a/unified-runtime/source/loader/ur_print.cpp
+++ b/unified-runtime/source/loader/ur_print.cpp
@@ -1147,6 +1147,30 @@ ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer,
   return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value,
+                                    char *buffer, const size_t buff_size,
+                                    size_t *out_size) {
+  std::stringstream ss;
+  ss << value;
+  return str_copy(&ss, buffer, buff_size, out_size);
+}
+
+ur_result_t urPrintExpKernelArgMemObjTuple(
+    const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer,
+    const size_t buff_size, size_t *out_size) {
+  std::stringstream ss;
+  ss << params;
+  return str_copy(&ss, buffer, buff_size, out_size);
+}
+
+ur_result_t urPrintExpKernelArgProperties(
+    const struct ur_exp_kernel_arg_properties_t params, char *buffer,
+    const size_t buff_size, size_t *out_size) {
+  std::stringstream ss;
+  ss << params;
+  return str_copy(&ss, buffer, buff_size, out_size);
+}
+
 ur_result_t urPrintExpEnqueueExtFlags(enum ur_exp_enqueue_ext_flag_t value,
                                       char *buffer, const size_t buff_size,
                                       size_t *out_size) {
@@ -1878,6 +1902,14 @@ ur_result_t urPrintEnqueueWriteHostPipeParams(
   return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintEnqueueKernelLaunchWithArgsExpParams(
+    const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params,
+    char *buffer, const size_t buff_size, size_t *out_size) {
+  std::stringstream ss;
+  ss << params;
+  return str_copy(&ss, buffer, buff_size, out_size);
+}
+
 ur_result_t urPrintEnqueueEventsWaitWithBarrierExtParams(
     const struct ur_enqueue_events_wait_with_barrier_ext_params_t *params,
     char *buffer, const size_t buff_size, size_t *out_size) {
diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp
index ebcdbe1a7882d..5bfb2bb96b51b 100644
--- a/unified-runtime/source/ur_api.cpp
+++ b/unified-runtime/source/ur_api.cpp
@@ -9210,6 +9210,95 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
   return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a command to execute a kernel
+///
+/// @remarks
+///   _Analogues_
+///     - **clEnqueueNDRangeKernel**
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hQueue`
+///         + `NULL == hKernel`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkSize`
+///         + `launchPropList == NULL && numPropsInLaunchPropList > 0`
+///         + `pArgs == NULL && numArgs > 0`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL
+///     - ::UR_RESULT_ERROR_INVALID_EVENT
+///     - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST
+///         + `phEventWaitList == NULL && numEventsInWaitList > 0`
+///         + `phEventWaitList != NULL && numEventsInWaitList == 0`
+///         + If event objects in phEventWaitList are not valid events.
+///     - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS
+///         + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR.
+///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
+///         + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 ||
+///         pGlobalWorkSize[2] == 0`
+///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
+///         + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] ==
+///         0 || pLocalWorkSize[2] == 0)`
+///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values
+///     have not been specified."
+///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
+///     - ::UR_RESULT_ERROR_INVALID_OPERATION
+///         + If any property in `launchPropList` isn't supported by the device.
+ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] handle of the kernel object
+    ur_kernel_handle_t hKernel,
+    /// [in] number of dimensions, from 1 to 3, to specify the global and
+    /// work-group work-items
+    uint32_t workDim,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of workDim unsigned values that specify the
+    /// number of global work-items in workDim that will execute the kernel
+    /// function
+    const size_t *pGlobalWorkSize,
+    /// [in][optional] pointer to an array of workDim unsigned values that
+    /// specify the number of local work-items forming a work-group that will
+    /// execute the kernel function.
+    /// If nullptr, the runtime implementation will choose the work-group size.
+    const size_t *pLocalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [in] size of the launch prop list
+    uint32_t numPropsInLaunchPropList,
+    /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list
+    /// of launch properties
+    const ur_kernel_launch_property_t *launchPropList,
+    /// [in] size of the event wait list
+    uint32_t numEventsInWaitList,
+    /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    /// events that must be complete before the kernel execution.
+    /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait
+    /// event.
+    const ur_event_handle_t *phEventWaitList,
+    /// [out][optional][alloc] return an event object that identifies this
+    /// particular kernel execution instance. If phEventWaitList and phEvent
+    /// are not NULL, phEvent must not refer to an element of the
+    /// phEventWaitList array.
+    ur_event_handle_t *phEvent) {
+  ur_result_t result = UR_RESULT_SUCCESS;
+  return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Enqueue a barrier command which waits a list of events to complete
 ///        before it completes, with optional extended properties
diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt
index 5d579dbbf506b..c1ca49f8e992a 100644
--- a/unified-runtime/test/conformance/CMakeLists.txt
+++ b/unified-runtime/test/conformance/CMakeLists.txt
@@ -79,6 +79,7 @@ set(TEST_SUBDIRECTORIES_DPCXX
     "integration"
     "exp_command_buffer"
     "exp_enqueue_native"
+    "exp_enqueue_kernel_launch_with_args"
     "exp_usm_p2p"
     "memory-migrate"
     "usm"
diff --git a/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt
new file mode 100644
index 0000000000000..d03e5ef7c072f
--- /dev/null
+++ b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (C) 2025 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+add_conformance_kernels_test(
+    exp_kernel_launch_with_args
+    urEnqueueKernelLaunchWithArgsExp.cpp
+)
diff --git a/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp
new file mode 100644
index 0000000000000..093bc56004e9a
--- /dev/null
+++ b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp
@@ -0,0 +1,303 @@
+// Copyright (C) 2025 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <uur/fixtures.h>
+
+#include <cstring>
+
+// This test runs a kernel with a mix of local memory, pointer and value args.
+struct urEnqueueKernelLaunchWithArgsTest : uur::urKernelExecutionTest {
+  void SetUp() override {
+    program_name = "saxpy_usm_local_mem";
+    UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+
+    ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                     sizeof(backend), &backend, nullptr));
+
+    // HIP has extra args for local memory so we define an offset for arg
+    // indices here for updating
+    hip_arg_offset = backend == UR_BACKEND_HIP ? 3 : 0;
+    ur_device_usm_access_capability_flags_t shared_usm_flags;
+    ASSERT_SUCCESS(
+        uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags));
+    if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) {
+      GTEST_SKIP() << "Shared USM is not supported.";
+    }
+
+    const size_t allocation_size =
+        sizeof(uint32_t) * global_size[0] * local_size[0];
+    for (auto &shared_ptr : shared_ptrs) {
+      ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
+                                      allocation_size, &shared_ptr));
+      ASSERT_NE(shared_ptr, nullptr);
+
+      std::vector<uint8_t> pattern(allocation_size);
+      uur::generateMemFillPattern(pattern);
+      std::memcpy(shared_ptr, pattern.data(), allocation_size);
+    }
+    uint32_t current_index = 0;
+    // Index 0 is local_mem_a arg
+    args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
+                    nullptr,
+                    UR_EXP_KERNEL_ARG_TYPE_LOCAL,
+                    current_index++,
+                    local_mem_a_size,
+                    {nullptr}});
+
+    // Hip has extra args for local mem at index 1-3
+    ur_exp_kernel_arg_value_t argValue = {};
+    if (backend == UR_BACKEND_HIP) {
+      argValue.value = &hip_local_offset;
+      ur_exp_kernel_arg_properties_t local_offset = {
+          UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
+          nullptr,
+          UR_EXP_KERNEL_ARG_TYPE_VALUE,
+          current_index++,
+          sizeof(hip_local_offset),
+          argValue};
+      args.push_back(local_offset);
+      local_offset.index = current_index++;
+      args.push_back(local_offset);
+      local_offset.index = current_index++;
+      args.push_back(local_offset);
+    }
+
+    // Index 1 is local_mem_b arg
+    args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
+                    nullptr,
+                    UR_EXP_KERNEL_ARG_TYPE_LOCAL,
+                    current_index++,
+                    local_mem_b_size,
+                    {nullptr}});
+
+    if (backend == UR_BACKEND_HIP) {
+      argValue.value = &hip_local_offset;
+      ur_exp_kernel_arg_properties_t local_offset = {
+          UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
+          nullptr,
+          UR_EXP_KERNEL_ARG_TYPE_VALUE,
+          current_index++,
+          sizeof(hip_local_offset),
+          argValue};
+      args.push_back(local_offset);
+      local_offset.index = current_index++;
+      args.push_back(local_offset);
+      local_offset.index = current_index++;
+      args.push_back(local_offset);
+    }
+
+    // Index 2 is output
+    argValue.pointer = shared_ptrs[0];
+    args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                    UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++,
+                    sizeof(shared_ptrs[0]), argValue});
+    // Index 3 is A
+    argValue.value = &A;
+    args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                    UR_EXP_KERNEL_ARG_TYPE_VALUE, current_index++, sizeof(A),
+                    argValue});
+    // Index 4 is X
+    argValue.pointer = shared_ptrs[1];
+    args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                    UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++,
+                    sizeof(shared_ptrs[1]), argValue});
+    // Index 5 is Y
+    argValue.pointer = shared_ptrs[2];
+    args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
+                    UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++,
+                    sizeof(shared_ptrs[2]), argValue});
+  }
+
+  void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A,
+                size_t length, size_t local_size) {
+    for (size_t i = 0; i < length; i++) {
+      uint32_t result = A * X[i] + Y[i] + local_size;
+      ASSERT_EQ(result, output[i]);
+    }
+  }
+
+  virtual void TearDown() override {
+    for (auto &shared_ptr : shared_ptrs) {
+      if (shared_ptr) {
+        EXPECT_SUCCESS(urUSMFree(context, shared_ptr));
+      }
+    }
+
+    UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown());
+  }
+
+  static constexpr size_t local_size[3] = {4, 1, 1};
+  static constexpr size_t local_mem_a_size = local_size[0] * sizeof(uint32_t);
+  static constexpr size_t local_mem_b_size = local_mem_a_size * 2;
+  static constexpr size_t global_size[3] = {16, 1, 1};
+  static constexpr size_t global_offset[3] = {0, 0, 0};
+  static constexpr uint32_t workDim = 3;
+  static constexpr uint32_t A = 42;
+  std::array<void *, 5> shared_ptrs = {nullptr, nullptr, nullptr, nullptr,
+                                       nullptr};
+
+  uint32_t hip_arg_offset = 0;
+  static constexpr uint64_t hip_local_offset = 0;
+  ur_backend_t backend{};
+  std::vector<ur_exp_kernel_arg_properties_t> args;
+};
+UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsTest);
+
+TEST_P(urEnqueueKernelLaunchWithArgsTest, Success) {
+  ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp(
+      queue, kernel, workDim, global_offset, global_size, local_size,
+      args.size(), args.data(), 0, nullptr, 0, nullptr, nullptr));
+  ASSERT_SUCCESS(urQueueFinish(queue));
+
+  uint32_t *output = (uint32_t *)shared_ptrs[0];
+  uint32_t *X = (uint32_t *)shared_ptrs[1];
+  uint32_t *Y = (uint32_t *)shared_ptrs[2];
+  Validate(output, X, Y, A, global_size[0], local_size[0]);
+}
+
+TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleQueue) {
+  ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE,
+                   urEnqueueKernelLaunchWithArgsExp(
+                       nullptr, kernel, workDim, global_offset, global_size,
+                       local_size, args.size(), args.data(), 0, nullptr, 0,
+                       nullptr, nullptr));
+}
+
+TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleKernel) {
+  ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE,
+                   urEnqueueKernelLaunchWithArgsExp(
+                       queue, nullptr, workDim, global_offset, global_size,
+                       local_size, args.size(), args.data(), 0, nullptr, 0,
+                       nullptr, nullptr));
+}
+
+TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerGlobalSize) {
+  ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER,
+                   urEnqueueKernelLaunchWithArgsExp(
+                       queue, kernel, workDim, global_offset, nullptr,
+                       local_size, args.size(), args.data(), 0, nullptr, 0,
+                       nullptr, nullptr));
+}
+
+TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerProperties) {
+  ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER,
+                   urEnqueueKernelLaunchWithArgsExp(
+                       queue, kernel, workDim, global_offset, global_size,
+                       local_size, args.size(), args.data(), 1, nullptr, 0,
+                       nullptr, nullptr));
+}
+
+TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerArgs) {
+  ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER,
+                   urEnqueueKernelLaunchWithArgsExp(
+                       queue, kernel, workDim, global_offset, global_size,
+                       local_size, args.size(), nullptr, 0, nullptr, 0, nullptr,
+                       nullptr));
+}
+
+TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidEventWaitList) {
+  ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST,
+                   urEnqueueKernelLaunchWithArgsExp(
+                       queue, kernel, workDim, global_offset, global_size,
+                       local_size, args.size(), args.data(), 0, nullptr, 1,
+                       nullptr, nullptr));
+  ur_event_handle_t event = nullptr;
+  ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST,
+                   urEnqueueKernelLaunchWithArgsExp(
+                       queue, kernel, workDim, global_offset, global_size,
+                       local_size, args.size(), args.data(), 0, nullptr, 0,
+                       &event, nullptr));
+}
+
+// This test runs a kernel with a buffer (MEM_OBJ) arg.
+struct urEnqueueKernelLaunchWithArgsMemObjTest : uur::urKernelExecutionTest {
+  void SetUp() override {
+    program_name = "fill";
+    UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+
+    ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                     sizeof(backend), &backend, nullptr));
+
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE,
+                                     sizeof(val) * global_size[0], nullptr,
+                                     &buffer));
+
+    char zero = 0;
+    ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffer, &zero, sizeof(zero), 0,
+                                          buffer_size, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // First argument is buffer to fill
+    unsigned current_arg_index = 0;
+    ur_exp_kernel_arg_mem_obj_tuple_t buffer_and_properties = {buffer, 0};
+    ur_exp_kernel_arg_properties_t arg = {
+        UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
+        nullptr,
+        UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ,
+        current_arg_index++,
+        sizeof(buffer),
+        {nullptr}};
+    arg.value.memObjTuple = buffer_and_properties;
+    args.push_back(arg);
+
+    // Add accessor arguments depending on backend.
+    // HIP has 3 offset parameters and other backends only have 1.
+    if (backend == UR_BACKEND_HIP) {
+      arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE;
+      arg.size = sizeof(hip_local_offset);
+      arg.value.value = &hip_local_offset;
+      arg.index = current_arg_index++;
+      args.push_back(arg);
+      arg.index = current_arg_index++;
+      args.push_back(arg);
+      arg.index = current_arg_index++;
+      args.push_back(arg);
+    } else {
+      arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE;
+      arg.index = current_arg_index++;
+      arg.size = sizeof(accessor);
+      arg.value.value = &accessor;
+      args.push_back(arg);
+    }
+
+    // Second user defined argument is scalar to fill with.
+    arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE;
+    arg.index = current_arg_index++;
+    arg.size = sizeof(val);
+    arg.value.value = &val;
+    args.push_back(arg);
+  }
+
+  void TearDown() override {
+    if (buffer) {
+      EXPECT_SUCCESS(urMemRelease(buffer));
+    }
+
+    UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown());
+  }
+
+  static constexpr uint32_t val = 42;
+  static constexpr size_t global_size[3] = {32, 1, 1};
+  static constexpr uint32_t workDim = 3;
+  static constexpr size_t buffer_size = sizeof(val) * global_size[0];
+  static constexpr uint64_t hip_local_offset = 0;
+  ur_backend_t backend{};
+  ur_mem_handle_t buffer = nullptr;
+  // This is the accessor offset struct sycl kernels expect to accompany buffer args.
+  struct {
+    size_t offsets[1] = {0};
+  } accessor;
+  std::vector<ur_exp_kernel_arg_properties_t> args;
+};
+UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsMemObjTest);
+
+TEST_P(urEnqueueKernelLaunchWithArgsMemObjTest, Success) {
+  ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp(
+      queue, kernel, workDim, nullptr, global_size, nullptr, args.size(),
+      args.data(), 0, nullptr, 0, nullptr, nullptr));
+  ASSERT_SUCCESS(urQueueFinish(queue));
+  ValidateBuffer(buffer, buffer_size, val);
+}