diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index a774f2291a65d..1b81e68e408aa 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -28,30 +28,26 @@ document for details of support of different SYCL backends. ### UR Command-Buffer Experimental Feature The command-buffer concept has been introduced to UR as an -[experimental feature](https://oneapi-src.github.io/unified-runtime/core/api.html#command-buffer-experimental) -with the following entry-points: - -| Function | Description | -| -------------------------------------------- | ----------- | -| `urCommandBufferCreateExp` | Create a command-buffer. | -| `urCommandBufferRetainExp` | Incrementing reference count of command-buffer. | -| `urCommandBufferReleaseExp` | Decrementing reference count of command-buffer. | -| `urCommandBufferFinalizeExp` | No more commands can be appended, makes command-buffer ready to enqueue on a command-queue. | -| `urCommandBufferAppendKernelLaunchExp` | Append a kernel execution command to command-buffer. | -| `urCommandBufferAppendUSMMemcpyExp` | Append a USM memcpy command to the command-buffer. | -| `urCommandBufferAppendUSMFillExp` | Append a USM fill command to the command-buffer. | -| `urCommandBufferAppendMemBufferCopyExp` | Append a mem buffer copy command to the command-buffer. | -| `urCommandBufferAppendMemBufferWriteExp` | Append a memory write command to a command-buffer object. | -| `urCommandBufferAppendMemBufferReadExp` | Append a memory read command to a command-buffer object. | -| `urCommandBufferAppendMemBufferCopyRectExp` | Append a rectangular memory copy command to a command-buffer object. | -| `urCommandBufferAppendMemBufferWriteRectExp` | Append a rectangular memory write command to a command-buffer object. | -| `urCommandBufferAppendMemBufferReadRectExp` | Append a rectangular memory read command to a command-buffer object. | -| `urCommandBufferAppendMemBufferFillExp` | Append a memory fill command to a command-buffer object. | -| `urEnqueueCommandBufferExp` | Submit command-buffer to a command-queue for execution. | -| `urCommandBufferUpdateKernelLaunchExp` | Updates the parameters of a previous kernel launch command. | - +[experimental feature](https://oneapi-src.github.io/unified-runtime/core/api.html#command-buffer-experimental). See the [UR EXP-COMMAND-BUFFER](https://oneapi-src.github.io/unified-runtime/core/EXP-COMMAND-BUFFER.html) -specification for more details. +specification for details. + +Device support for SYCL-Graph is communicated to the user via two aspects. +The `aspect::ext_oneapi_limited_graph` aspect for basic graph support and +the `aspect::ext_oneapi_graph` aspect for full graph support. + +The `UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP` query result is used by the +SYCL-RT to inform whether to report `aspect::ext_oneapi_limited_graph`. + +Reporting of the `aspect::ext_oneapi_graph` aspect is based on the +`UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP` query result. For +a device to report this aspect, the UR query must report support for all of: + +* `UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS` +* `UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE` +* `UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE` +* `UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET` +* `UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE` ## Design @@ -608,43 +604,14 @@ SYCL-Graph is only enabled for an OpenCL backend when the extension is available, however this information isn't available until runtime due to OpenCL implementations being loaded through an ICD. -The `ur_exp_command_buffer` string is conditionally returned from the OpenCL -command-buffer UR backend at runtime based on `cl_khr_command_buffer` support -to indicate that the graph extension should be enabled. This is information -is propagated to the SYCL user via the -`device.get_info()` query for graph extension -support. - -#### Limitations - -Due to the API mapping gaps documented in the following section, OpenCL as a -SYCL backend cannot fully support the graph API. Instead, there are -limitations in the types of nodes which a user can add to a graph, using -an unsupported node type will cause a SYCL exception to be thrown in graph -finalization with error code `sycl::errc::feature_not_supported` and a message -mentioning the unsupported command. For example, - -``` -terminate called after throwing an instance of 'sycl::_V1::exception' -what(): USM copy command not supported by graph backend -``` - -The types of commands which are unsupported, and lead to this exception are: -* `handler::copy(src, dest)` - Where `src` is an accessor and `dest` is a pointer. - This corresponds to a memory buffer read command. -* `handler::copy(src, dest)` - Where `src` is an pointer and `dest` is an accessor. - This corresponds to a memory buffer write command. -* `handler::copy(src, dest)` or `handler::memcpy(dest, src)` - Where both `src` and - `dest` are USM pointers. This corresponds to a USM copy command. -* `handler::fill(ptr, pattern, count)` - This corresponds to a USM memory - fill command. -* `handler::memset(ptr, value, numBytes)` - This corresponds to a USM memory - fill command. -* `handler::prefetch()`. -* `handler::mem_advise()`. - -Note that `handler::copy(src, dest)` where both `src` and `dest` are an accessor -is supported, as a memory buffer copy command exists in the OpenCL extension. +The `UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP` UR query returns true in the +OpenCL UR adapter based on +the presence of `cl_khr_command_buffer`, and the OpenCL device reporting +support for +[CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR). +The latter is required to enable multiple submissions of the same executable +`command_graph` object without having to do a blocking wait on prior submissions +in-between. #### UR API Mapping @@ -678,18 +645,56 @@ adapter where there is matching support for each function in the list. | | clGetCommandBufferInfoKHR | No | | | clCommandSVMMemcpyKHR | No | | | clCommandSVMMemFillKHR | No | -| urCommandBufferUpdateKernelLaunchExp | clUpdateMutableCommandsKHR | Yes[1] | +| urCommandBufferUpdateKernelLaunchExp | clUpdateMutableCommandsKHR | Partial [See Update Section](#update-support) | We are looking to address these gaps in the future so that SYCL-Graph can be fully supported on a `cl_khr_command_buffer` backend. -[1] Support for `urCommandBufferUpdateKernelLaunchExp` used to update the +#### Unsupported Command Types + +Due to the API mapping gaps documented in the previous section, OpenCL as a +SYCL backend cannot fully support the graph API. Instead, there are +limitations in the types of nodes which a user can add to a graph, using +an unsupported node type will cause a SYCL exception to be thrown in graph +finalization with error code `sycl::errc::feature_not_supported` and a message +mentioning the unsupported command. For example, + +``` +terminate called after throwing an instance of 'sycl::_V1::exception' +what(): USM copy command not supported by graph backend +``` + +The types of commands which are unsupported, and lead to this exception are: +* `handler::copy(src, dest)` - Where `src` is an accessor and `dest` is a pointer. + This corresponds to a memory buffer read command. +* `handler::copy(src, dest)` - Where `src` is an pointer and `dest` is an accessor. + This corresponds to a memory buffer write command. +* `handler::copy(src, dest)` or `handler::memcpy(dest, src)` - Where both `src` and + `dest` are USM pointers. This corresponds to a USM copy command. +* `handler::fill(ptr, pattern, count)` - This corresponds to a USM memory + fill command. +* `handler::memset(ptr, value, numBytes)` - This corresponds to a USM memory + fill command. +* `handler::prefetch()`. +* `handler::mem_advise()`. + +Note that `handler::copy(src, dest)` where both `src` and `dest` are an accessor +is supported, as a memory buffer copy command exists in the OpenCL extension. + +#### Update Support + +Support for `urCommandBufferUpdateKernelLaunchExp` used to update the configuration of kernel commands requires an OpenCL implementation with the [cl_khr_command_buffer_mutable_dispatch](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#cl_khr_command_buffer_mutable_dispatch) -extension. The optional capabilities that are reported by this extension must -include all of of `CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR`, -`CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR`, `CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR`, -`CL_MUTABLE_DISPATCH_ARGUMENTS_KHR`, and `CL_MUTABLE_DISPATCH_EXEC_INFO_KHR`. +extension. + +However, the OpenCL adapter can not report `aspect::ext_oneapi_graph` for full +SYCL-Graph support. As the `cl_khr_command_buffer_mutable_dispatch` extension +has no concept of updating the `cl_kernel` objects in kernel commands, and so +can't report the +`UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE` capability. +This extension limitation is tracked in by the OpenCL Working Group in an +[OpenCL-Docs Issue](https://github.com/KhronosGroup/OpenCL-Docs/issues/1279). #### UR Command-Buffer Implementation diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index db6ca11f40235..e3b1306ef6d8f 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1347,6 +1347,11 @@ Parameters: The other is <> to enable profiling events returned from submissions of the executable graph. +Exceptions: + +* Throws synchronously with error code `feature_not_supported` if the graph + contains a command that is not supported by the device. + Returns: A new executable graph object which can be submitted to a queue. | diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 35f3b461bc01b..cf9ca70196612 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -985,9 +985,9 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, // and potential hangs. We have therefore to expliclty wait in the host // for previous submission to complete before resubmitting the // command-buffer for level-zero backend. - // TODO : add a check to release this constraint and allow multiple - // concurrent submissions if the exec_graph has been updated since the - // last submission. + // TODO https://github.com/intel/llvm/issues/17734 + // Remove this backend specific behavior and allow multiple concurrent + // submissions of the UR command-buffer. for (std::vector::iterator It = MExecutionEvents.begin(); It != MExecutionEvents.end();) { diff --git a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst index 54832cf5da7d4..f20c84bb027d9 100644 --- a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst @@ -58,11 +58,11 @@ to provide additional properties for how the command-buffer should be constructed. The members defined in ${x}_exp_command_buffer_desc_t are: * ``isUpdatable``, which should be set to ``true`` to support :ref:`updating -command-buffer commands`. + command-buffer commands`. * ``isInOrder``, which should be set to ``true`` to enable commands enqueued to -a command-buffer to be executed in an in-order fashion where possible. + a command-buffer to be executed in an in-order fashion where possible. * ``enableProfiling``, which should be set to ``true`` to enable profiling of -the command-buffer. + the command-buffer. Command-buffers are reference counted and can be retained and released by calling ${x}CommandBufferRetainExp and ${x}CommandBufferReleaseExp respectively. @@ -226,15 +226,30 @@ Enqueueing Command-Buffers Command-buffers are submitted for execution on a ${x}_queue_handle_t with an optional list of dependent events. An event is returned which tracks the execution of the command-buffer, and will be complete when all appended commands -have finished executing. It is adapter specific whether command-buffers can be -enqueued or executed simultaneously, and submissions may be serialized. +have finished executing. .. parsed-literal:: ${x}_event_handle_t executionEvent; - ${x}EnqueueCommandBufferExp(hQueue, hCommandBuffer, 0, nullptr, &executionEvent); +A command-buffer can be submitted for execution while a previous submission +of the same command-buffer is still awaiting completion. That is, the user is not +required to do a blocking wait on the completion of the first command-buffer +submission before making a second submission of the command-buffer. + +Submissions of the same command-buffer should be synchronized to prevent +concurrent execution. For example, by using events, barriers, or in-order queue +dependencies. The behavior of multiple submissions of the same command-buffer +that can execute concurrently is undefined. + +.. parsed-literal:: + // Valid usage if hQueue is in-order but undefined behavior is out-of-order + ${x}EnqueueCommandBufferExp(hQueue, hCommandBuffer, 0, nullptr, + nullptr); + ${x}EnqueueCommandBufferExp(hQueue, hCommandBuffer, 0, nullptr, + nullptr); + Updating Command-Buffer Commands -------------------------------------------------------------------------------- diff --git a/unified-runtime/source/adapters/opencl/device.cpp b/unified-runtime/source/adapters/opencl/device.cpp index 3e466b9f04dbb..0b9552e3390c1 100644 --- a/unified-runtime/source/adapters/opencl/device.cpp +++ b/unified-runtime/source/adapters/opencl/device.cpp @@ -1524,9 +1524,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, CL_RETURN_ON_FAILURE(clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, ExtSize, ExtStr.data(), nullptr)); - std::string SupportedExtensions(ExtStr.c_str()); - return ReturnValue(ExtStr.find("cl_khr_command_buffer") != - std::string::npos); + // cl_khr_command_buffer is required for UR command-buffer support + cl_device_command_buffer_capabilities_khr Caps = 0; + if (ExtStr.find("cl_khr_command_buffer") != std::string::npos) { + // A UR command-buffer user needs to be able to enqueue another + // submission of the same UR command-buffer without having to manually + // check if the first submission has completed. + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(Dev, CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR, + sizeof(Caps), &Caps, nullptr)); + } + + return ReturnValue( + 0 != (Caps & CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR)); } case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { cl_device_id Dev = cl_adapter::cast(hDevice); diff --git a/unified-runtime/test/conformance/exp_command_buffer/fill.cpp b/unified-runtime/test/conformance/exp_command_buffer/fill.cpp index ed057d26400cc..e2f03522e544a 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/fill.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/fill.cpp @@ -122,6 +122,31 @@ TEST_P(urCommandBufferFillCommandsTest, Buffer) { verifyData(output, size); } +TEST_P(urCommandBufferFillCommandsTest, ExecuteTwice) { + // TODO https://github.com/intel/llvm/issues/17734 + // Fail on Level-Zero due to blocking wait code in graph_impl.cpp specific + // to the level-zero backend that needs moved into the Level-Zero v1 adapter. + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp( + cmd_buf_handle, buffer, pattern.data(), pattern_size, 0, size, 0, nullptr, + 0, nullptr, &sync_point, nullptr, nullptr)); + + std::vector output(size, 1); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferReadExp( + cmd_buf_handle, buffer, 0, size, output.data(), 1, &sync_point, 0, + nullptr, nullptr, nullptr, nullptr)); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urEnqueueCommandBufferExp(queue, cmd_buf_handle, 0, nullptr, nullptr)); + ASSERT_SUCCESS( + urEnqueueCommandBufferExp(queue, cmd_buf_handle, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + verifyData(output, size); +} + TEST_P(urCommandBufferFillCommandsTest, USM) { ASSERT_SUCCESS(urCommandBufferAppendUSMFillExp( cmd_buf_handle, device_ptr, pattern.data(), pattern_size, size, 0,