diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..202bfdcdc19e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 4c07083d355c7..91a797c4e6b85 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -6,4 +6,4 @@ # Merge pull request #2578 from Bensuo/ewan/remove_command_ref_counting # # Remove command-buffer command handle ref counting -set(UNIFIED_RUNTIME_TAG 14f4a3ba70b91b3adc411ec6bfc8ae86e948a990) +set(UNIFIED_RUNTIME_TAG "ewan/native_command") diff --git a/sycl/include/sycl/detail/backend_traits_cuda.hpp b/sycl/include/sycl/detail/backend_traits_cuda.hpp index 89bef47d01a4b..9a4df94693329 100644 --- a/sycl/include/sycl/detail/backend_traits_cuda.hpp +++ b/sycl/include/sycl/detail/backend_traits_cuda.hpp @@ -24,6 +24,7 @@ typedef struct CUctx_st *CUcontext; typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; typedef struct CUmod_st *CUmodule; +typedef struct CUgraph_st *CUgraph; // As defined in the CUDA 10.1 header file. 
This requires CUDA version > 3.2 #if defined(_WIN64) || defined(__LP64__) @@ -102,6 +103,16 @@ template <> struct BackendReturn { using type = CUstream; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; +template <> struct BackendInput { + using type = CUgraph; +}; + +template <> struct BackendReturn { + using type = CUgraph; +}; + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index e2e87c30ea945..6401bdbbab463 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -114,7 +114,8 @@ enum class node_type { prefetch = 6, memadvise = 7, ext_oneapi_barrier = 8, - host_task = 9 + host_task = 9, + native_command = 10 }; /// Class representing a node in the graph, returned by command_graph::add(). diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 552c6afe195be..b03e1698acacf 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1934,9 +1934,6 @@ class __SYCL_EXPORT handler { void(interop_handle)>::value> ext_codeplay_enqueue_native_command([[maybe_unused]] FuncT &&Func) { #ifndef __SYCL_DEVICE_ONLY__ - throwIfGraphAssociated< - ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_codeplay_enqueue_native_command>(); ext_codeplay_enqueue_native_command_impl(Func); #endif } diff --git a/sycl/include/sycl/interop_handle.hpp b/sycl/include/sycl/interop_handle.hpp index 2e7408cf5c0f9..4415bdb2402ce 100644 --- a/sycl/include/sycl/interop_handle.hpp +++ b/sycl/include/sycl/interop_handle.hpp @@ -49,6 +49,9 @@ class interop_handle { /// interop_handle. 
__SYCL_EXPORT backend get_backend() const noexcept; + /// Returns true if command-group is being added to a graph as a node + __SYCL_EXPORT bool has_graph() const noexcept; + /// Receives a SYCL accessor that has been defined as a requirement for the /// command group, and returns the underlying OpenCL memory object that is /// used by the SYCL runtime. If the accessor passed as parameter is not part @@ -134,6 +137,26 @@ class interop_handle { #endif } + using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; + template + backend_return_t get_native_graph() const { +#ifndef __SYCL_DEVICE_ONLY__ + // TODO: replace the exception thrown below with the SYCL 2020 exception + // with the error code 'errc::backend_mismatch' when those new exceptions + // are ready to be used. + if (Backend != get_backend()) + throw exception(make_error_code(errc::invalid), + "Incorrect backend argument was passed"); + + // C-style cast required to allow various native types + return (backend_return_t)getNativeGraph(); +#else + // we believe this won't be ever called on device side + return 0; +#endif + } + /// Returns the SYCL application interoperability native backend object /// associated with the device associated with the SYCL queue that the host /// task was submitted to. 
The native backend object returned must be in @@ -186,8 +209,9 @@ class interop_handle { interop_handle(std::vector MemObjs, const std::shared_ptr &Queue, const std::shared_ptr &Device, - const std::shared_ptr &Context) - : MQueue(Queue), MDevice(Device), MContext(Context), + const std::shared_ptr &Context, + const ur_exp_command_buffer_handle_t &Graph) + : MQueue(Queue), MDevice(Device), MContext(Context), MGraph(Graph), MMemObjs(std::move(MemObjs)) {} template @@ -211,10 +235,12 @@ class interop_handle { getNativeQueue(int32_t &NativeHandleDesc) const; __SYCL_EXPORT ur_native_handle_t getNativeDevice() const; __SYCL_EXPORT ur_native_handle_t getNativeContext() const; + __SYCL_EXPORT ur_native_handle_t getNativeGraph() const; std::shared_ptr MQueue; std::shared_ptr MDevice; std::shared_ptr MContext; + ur_exp_command_buffer_handle_t MGraph; std::vector MMemObjs; }; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 5f9f79d878d03..c940b35c81448 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -824,6 +824,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode( std::shared_ptr Node) { // Queue which will be used for allocation operations for accessors. 
+ // Will also be used in native commands to return to the user in + // `interop_handler::get_native_queue()` calls auto AllocaQueue = std::make_shared( DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{}, sycl::property_list{}); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index e609123b4f285..f5d960b2d15e6 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -71,6 +71,8 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { return node_type::host_task; case sycl::detail::CGType::ExecCommandBuffer: return node_type::subgraph; + case sycl::detail::CGType::EnqueueNativeCommand: + return node_type::native_command; default: assert(false && "Invalid Graph Node Type"); return node_type::empty; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 005008a74ebd0..d92077084fc2b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -454,7 +454,7 @@ class DispatchHostTask { "Host task submissions should have an associated queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), - HostTask.MQueue->getContextImplPtr()}; + HostTask.MQueue->getContextImplPtr(), nullptr}; // TODO: should all the backends that support this entry point use this // for host task? 
auto &Queue = HostTask.MQueue; @@ -2879,6 +2879,19 @@ ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, return Error; } +namespace { + +struct CommandBufferNativeCommandData { + sycl::interop_handle ih; + std::function func; +}; + +void CommandBufferInteropFreeFunc(void *InteropData) { + auto *Data = reinterpret_cast(InteropData); + return Data->func(Data->ih); +} +} // namespace + ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { assert(MQueue && "Command buffer enqueue should have an associated queue"); // Wait on host command dependencies @@ -3045,6 +3058,55 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } + case CGType::EnqueueNativeCommand: { + // Queue is created by graph_impl before creating command to submit to + // scheduler. + const AdapterPtr &Adapter = MQueue->getAdapter(); + const auto Backend = MQueue->get_device().get_backend(); + CGHostTask *HostTask = (CGHostTask *)MCommandGroup.get(); + + // TODO - Doc this + ur_exp_command_buffer_handle_t ChildCommandBuffer = nullptr; + if (Backend == sycl::backend::ext_oneapi_cuda || + Backend == sycl::backend::ext_oneapi_hip) { + + ur_exp_command_buffer_desc_t Desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC /*stype*/, + nullptr /*pnext*/, false /* updatable */, false /* in-order */, + false /* profilable*/ + }; + auto ContextImpl = sycl::detail::getSyclObjImpl(MQueue->get_context()); + auto DeviceImpl = sycl::detail::getSyclObjImpl(MQueue->get_device()); + Adapter->call( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc, + &ChildCommandBuffer); + } + + std::vector ReqToMem; // TODO work with buffers + interop_handle IH{ReqToMem, HostTask->MQueue, + HostTask->MQueue->getDeviceImplPtr(), + HostTask->MQueue->getContextImplPtr(), + ChildCommandBuffer ? 
ChildCommandBuffer : MCommandBuffer}; + CommandBufferNativeCommandData CustomOpData{ + IH, HostTask->MHostTask->MInteropTask}; + + Adapter->call( + MCommandBuffer, CommandBufferInteropFreeFunc, &CustomOpData, + ChildCommandBuffer, MSyncPointDeps.size(), + MSyncPointDeps.empty() ? nullptr : MSyncPointDeps.data(), + &OutSyncPoint); + + if (ChildCommandBuffer) { + ur_result_t Res = Adapter->call_nocheck< + sycl::detail::UrApiKind::urCommandBufferReleaseExp>( + ChildCommandBuffer); + (void)Res; + assert(Res == UR_RESULT_SUCCESS); + } + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } default: throw exception(make_error_code(errc::runtime), @@ -3416,7 +3478,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { EnqueueNativeCommandData CustomOpData{ interop_handle{ReqToMem, HostTask->MQueue, HostTask->MQueue->getDeviceImplPtr(), - HostTask->MQueue->getContextImplPtr()}, + HostTask->MQueue->getContextImplPtr(), nullptr}, HostTask->MHostTask->MInteropTask}; ur_bool_t NativeCommandSupport = false; diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index aabf22702ef5f..d4912c6c8b3e4 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -23,6 +23,8 @@ backend interop_handle::get_backend() const noexcept { return detail::getImplBackend(MQueue); } +bool interop_handle::has_graph() const noexcept { return MGraph != nullptr; } + ur_native_handle_t interop_handle::getNativeMem(detail::Requirement *Req) const { auto Iter = std::find_if(std::begin(MMemObjs), std::end(MMemObjs), @@ -53,5 +55,17 @@ interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const { return MQueue->getNative(NativeHandleDesc); } +ur_native_handle_t interop_handle::getNativeGraph() const { + if (!MGraph) { + throw exception(make_error_code(errc::invalid), + "Command-Group is not being added as a graph node"); + } + + auto Adapter = MQueue->getAdapter(); + ur_native_handle_t Handle; + Adapter->call(MGraph, + &Handle); + return 
Handle; +} } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp new file mode 100644 index 0000000000000..10acae92cd1a5 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp @@ -0,0 +1,87 @@ +// RUN: %{build} -g -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (IH.has_graph()) { + CUgraph NativeGraph = IH.get_native_graph(); + CUgraphNode Node; + // TODO figure this out + /* + CUDA_MEMCPY3D &Params + 181 std::memset(&Params, 0, sizeof(CUDA_MEMCPY3D)); + 182 + 183 Params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + 184 Params.srcDevice = SrcType == + 185 ? *static_cast(SrcPtr) + 186 : 0; + 187 Params.srcHost = cType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr; + 188 Params.dstMemoryType = DstType; + 189 Params.dstDevice = + 190 DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstPtr) : 0; + 191 Params.dstHost = DstType == CU_MEMORYTYPE_HOST ? 
DstPtr : nullptr; + 192 Params.WidthInBytes = Size; + 193 Params.Height = 1; + 194 Params.Depth = 1; + auto Res = cuGraphAddMemcpyNode1D(&Node, NativeGraph, nullptr, 0, + ((CUdeviceptr)PtrY, (CUdeviceptr)PtrX, + Size * sizeof(int), &Params, Context); + assert(Res == CUDA_SUCCESS); + */ + } else { + assert(false && "Native Handle should have a graph"); + } + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp new file mode 100644 index 0000000000000..8d91d51cc2798 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp @@ -0,0 +1,90 @@ +// RUN: %{build} -g -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (IH.has_graph()) { + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created 
with cuGraphCreate + CUgraph NativeGraph = IH.get_native_graph<backend::ext_oneapi_cuda>(); + + // Start stream capture + auto Res = + cuStreamBeginCapture(NativeStream, CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); + + // Add memcopy node + Res = cuMemcpyAsync((CUdeviceptr)PtrY, (CUdeviceptr)PtrX, + Size * sizeof(int), NativeStream); + assert(Res == CUDA_SUCCESS); + + // cuStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + CUgraph RecordedGraph; + Res = cuStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == CUDA_SUCCESS); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. + CUgraphNode Node; + Res = cuGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, + RecordedGraph); + assert(Res == CUDA_SUCCESS); + } else { + assert(false && "Native Handle should have a graph"); + } + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector<int> HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +}