diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index e2e62cfcd531b..233091469be88 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -2468,14 +2468,17 @@ static ur_result_t SetKernelParamsAndLaunch(
         /* pPropSizeRet = */ nullptr);
 
     const bool EnforcedLocalSize =
-        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
-         RequiredWGSize[2] != 0);
+        (RequiredWGSize[0] != 0 &&
+         (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
+         (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
     if (EnforcedLocalSize)
       LocalSize = RequiredWGSize;
   }
-  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
-                         NDRDesc.GlobalOffset[1] != 0 ||
-                         NDRDesc.GlobalOffset[2] != 0;
+
+  // The offset is needed if any dimension in use has a non-zero component.
+  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
+                         (NDRDesc.Dims >= 2 && NDRDesc.GlobalOffset[1] != 0) ||
+                         (NDRDesc.Dims >= 3 && NDRDesc.GlobalOffset[2] != 0);
 
   std::vector property_list;
 
@@ -2610,6 +2613,11 @@ ur_result_t enqueueImpCommandBufferKernel(
   size_t RequiredWGSize[3] = {0, 0, 0};
   size_t *LocalSize = nullptr;
 
+  // The offset is needed if any dimension in use has a non-zero component.
+  const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
+                         (NDRDesc.Dims >= 2 && NDRDesc.GlobalOffset[1] != 0) ||
+                         (NDRDesc.Dims >= 3 && NDRDesc.GlobalOffset[2] != 0);
+
   if (HasLocalSize)
     LocalSize = &NDRDesc.LocalSize[0];
   else {
@@ -2620,8 +2628,9 @@ ur_result_t enqueueImpCommandBufferKernel(
         /* pPropSizeRet = */ nullptr);
 
     const bool EnforcedLocalSize =
-        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
-         RequiredWGSize[2] != 0);
+        (RequiredWGSize[0] != 0 &&
+         (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) &&
+         (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0));
     if (EnforcedLocalSize)
       LocalSize = RequiredWGSize;
   }
@@ -2637,7 +2646,8 @@ ur_result_t enqueueImpCommandBufferKernel(
 
   ur_result_t Res =
       Adapter.call_nocheck(
-          CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
+          CommandBuffer, UrKernel, NDRDesc.Dims,
+          HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
           &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(),
           AltUrKernels.size() ? AltUrKernels.data() : nullptr,
           SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0,
diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h
index 2f3a26a9b988e..2459b5b36647d 100644
--- a/unified-runtime/include/ur_api.h
+++ b/unified-runtime/include/ur_api.h
@@ -11210,7 +11210,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferFinalizeExp(
 /// + `NULL == hCommandBuffer`
 /// + `NULL == hKernel`
 /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
-/// + `NULL == pGlobalWorkOffset`
 /// + `NULL == pGlobalWorkSize`
 /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP
 /// - ::UR_RESULT_ERROR_INVALID_KERNEL
@@ -11244,7 +11243,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_kernel_handle_t hKernel,
     /// [in] Dimension of the kernel execution.
     uint32_t workDim,
-    /// [in] Offset to use when executing kernel.
+    /// [in][optional] Offset to use when executing kernel.
     const size_t *pGlobalWorkOffset,
     /// [in] Global work size to use when executing kernel.
     const size_t *pGlobalWorkSize,
diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml
index a194777f9e40b..432354b89f5e7 100644
--- a/unified-runtime/scripts/core/exp-command-buffer.yml
+++ b/unified-runtime/scripts/core/exp-command-buffer.yml
@@ -353,7 +353,7 @@ params:
       desc: "[in] Dimension of the kernel execution."
     - type: "const size_t*"
       name: pGlobalWorkOffset
-      desc: "[in] Offset to use when executing kernel."
+      desc: "[in][optional] Offset to use when executing kernel."
     - type: "const size_t*"
       name: pGlobalWorkSize
       desc: "[in] Global work size to use when executing kernel."
diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp
index 19082b8947b4b..c33719be3c540 100644
--- a/unified-runtime/source/adapters/cuda/command_buffer.cpp
+++ b/unified-runtime/source/adapters/cuda/command_buffer.cpp
@@ -109,8 +109,15 @@ kernel_command_data::kernel_command_data(
     ur_kernel_handle_t *KernelAlternatives)
     : Kernel(Kernel), Params(Params), WorkDim(WorkDim) {
   const size_t CopySize = sizeof(size_t) * WorkDim;
-  std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize);
   std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize);
+
+  // GlobalWorkOffsetPtr may be nullptr
+  if (GlobalWorkOffsetPtr) {
+    std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize);
+  } else {
+    std::memset(GlobalWorkOffset, 0, sizeof(size_t) * 3);
+  }
+
   // Local work size may be nullptr
   if (LocalWorkSizePtr) {
     std::memcpy(LocalWorkSize, LocalWorkSizePtr, CopySize);
diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp
index abac82900fc82..c75375dd80bc4 100644
--- a/unified-runtime/source/adapters/hip/command_buffer.cpp
+++ b/unified-runtime/source/adapters/hip/command_buffer.cpp
@@ -55,8 +55,15 @@ ur_exp_command_buffer_command_handle_t_::
     : handle_base(), CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node),
       Params(Params), WorkDim(WorkDim) {
   const size_t CopySize = sizeof(size_t) * WorkDim;
-  std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize);
   std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize);
+
+  // GlobalWorkOffsetPtr may be nullptr
+  if (GlobalWorkOffsetPtr) {
+    std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize);
+  } else {
+    std::memset(GlobalWorkOffset, 0, sizeof(size_t) * 3);
+  }
+
   // Local work size may be nullptr
   if (LocalWorkSizePtr) {
     std::memcpy(LocalWorkSize, LocalWorkSizePtr, CopySize);
diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
index 7956f048db92e..2e9a897fa9128 100644
--- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp
+++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
@@ -9681,7 +9681,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_kernel_handle_t hKernel,
     /// [in] Dimension of the kernel execution.
     uint32_t workDim,
-    /// [in] Offset to use when executing kernel.
+    /// [in][optional] Offset to use when executing kernel.
     const size_t *pGlobalWorkOffset,
     /// [in] Global work size to use when executing kernel.
     const size_t *pGlobalWorkSize,
diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
index d096d3895c385..9a4d63848ece6 100644
--- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
@@ -8167,7 +8167,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_kernel_handle_t hKernel,
     /// [in] Dimension of the kernel execution.
     uint32_t workDim,
-    /// [in] Offset to use when executing kernel.
+    /// [in][optional] Offset to use when executing kernel.
     const size_t *pGlobalWorkOffset,
     /// [in] Global work size to use when executing kernel.
     const size_t *pGlobalWorkSize,
diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
index 32dec6f1b25df..93ae5385ab541 100644
--- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
+++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
@@ -8916,7 +8916,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_kernel_handle_t hKernel,
     /// [in] Dimension of the kernel execution.
     uint32_t workDim,
-    /// [in] Offset to use when executing kernel.
+    /// [in][optional] Offset to use when executing kernel.
     const size_t *pGlobalWorkOffset,
     /// [in] Global work size to use when executing kernel.
     const size_t *pGlobalWorkSize,
@@ -8961,9 +8961,6 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
   }
 
   if (getContext()->enableParameterValidation) {
-    if (NULL == pGlobalWorkOffset)
-      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
-
     if (NULL == pGlobalWorkSize)
       return UR_RESULT_ERROR_INVALID_NULL_POINTER;
 
diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp
index 75ae04bc5a4a8..e63ead3d0ae37 100644
--- a/unified-runtime/source/loader/ur_ldrddi.cpp
+++ b/unified-runtime/source/loader/ur_ldrddi.cpp
@@ -4628,7 +4628,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_kernel_handle_t hKernel,
     /// [in] Dimension of the kernel execution.
     uint32_t workDim,
-    /// [in] Offset to use when executing kernel.
+    /// [in][optional] Offset to use when executing kernel.
     const size_t *pGlobalWorkOffset,
     /// [in] Global work size to use when executing kernel.
     const size_t *pGlobalWorkSize,
diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp
index 64967af8acc02..dbdeaf8173f04 100644
--- a/unified-runtime/source/loader/ur_libapi.cpp
+++ b/unified-runtime/source/loader/ur_libapi.cpp
@@ -8630,7 +8630,6 @@ ur_result_t UR_APICALL urCommandBufferFinalizeExp(
 /// + `NULL == hCommandBuffer`
 /// + `NULL == hKernel`
 /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
-/// + `NULL == pGlobalWorkOffset`
 /// + `NULL == pGlobalWorkSize`
 /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP
 /// - ::UR_RESULT_ERROR_INVALID_KERNEL
@@ -8664,7 +8663,7 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_kernel_handle_t hKernel,
     /// [in] Dimension of the kernel execution.
     uint32_t workDim,
-    /// [in] Offset to use when executing kernel.
+    /// [in][optional] Offset to use when executing kernel.
     const size_t *pGlobalWorkOffset,
     /// [in] Global work size to use when executing kernel.
     const size_t *pGlobalWorkSize,
diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp
index 9afdebdbae42f..eee9fde9f3d5f 100644
--- a/unified-runtime/source/ur_api.cpp
+++ b/unified-runtime/source/ur_api.cpp
@@ -7513,7 +7513,6 @@ ur_result_t UR_APICALL urCommandBufferFinalizeExp(
 /// + `NULL == hCommandBuffer`
 /// + `NULL == hKernel`
 /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
-/// + `NULL == pGlobalWorkOffset`
 /// + `NULL == pGlobalWorkSize`
 /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP
 /// - ::UR_RESULT_ERROR_INVALID_KERNEL
@@ -7547,7 +7546,7 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     ur_kernel_handle_t hKernel,
     /// [in] Dimension of the kernel execution.
     uint32_t workDim,
-    /// [in] Offset to use when executing kernel.
+    /// [in][optional] Offset to use when executing kernel.
     const size_t *pGlobalWorkOffset,
     /// [in] Global work size to use when executing kernel.
     const size_t *pGlobalWorkSize,