diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index b782b017191ed..f1627403edcc3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -122,7 +122,7 @@ elseif(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/ianayl/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 62720fd524b37..6c6736cbb151b 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit d03f19a88e42cb98be9604ff24b61190d1e48727 -# Merge: 3ce6fcc9 84454b0e -# Author: Kenneth Benzie (Benie) -# Date: Thu Feb 13 11:43:34 2025 +0000 -# Merge pull request #2680 from ldorau/Set_UMF_CUDA_INCLUDE_DIR_to_not_fetch_cudart_from_gitlab -# Do not fetch cudart from gitlab for UMF -set(UNIFIED_RUNTIME_TAG d03f19a88e42cb98be9604ff24b61190d1e48727) +# commit 5e33542d592b63fc4d35fc4ffd02efb5c33395ec +# Merge: 1b0f730f 3ce6fcc9 +# Author: Li, Ian +# Date: Wed Feb 12 14:47:55 2025 -0800 +# +# Merge branch 'main' of https://github.com/oneapi-src/unified-runtime into ianayl/2way-prefetch +set(UNIFIED_RUNTIME_TAG 5e33542d592b63fc4d35fc4ffd02efb5c33395ec) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc index 933a6aabd2bd4..63f891b1e35b9 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc @@ -633,14 +633,22 @@ a! ---- namespace sycl::ext::oneapi::experimental { -void prefetch(sycl::queue q, void* ptr, size_t numBytes); +enum class prefetch_type { device, host }; -void prefetch(sycl::handler &h, void* ptr, size_t numBytes); +void prefetch(sycl::queue q, void* ptr, size_t numBytes, + prefetch_type type = prefetch_type::device); + +void prefetch(sycl::handler &h, void* ptr, size_t numBytes, + prefetch_type type = prefetch_type::device); } ---- !==== -_Effects_: Enqueues a `prefetch` to the `sycl::queue` or `sycl::handler`. +_Effects_: Enqueues a `prefetch` to the `sycl::queue` or `sycl::handler`. The +`type` parameter tells the direction of the prefetch operation: When the value +is `prefetch_type::device`, the memory is prefetched _to_ the device associated +with the queue. When the value is `prefetch_type::host`, the memory is +prefetched _to_ the host, regardless of the device on which it currently resides. a| [frame=all,grid=none] diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp index c8f7a4ef261c4..3f678b911b0cd 100644 --- a/sycl/include/sycl/detail/cg_types.hpp +++ b/sycl/include/sycl/detail/cg_types.hpp @@ -8,26 +8,26 @@ #pragma once -#include // for array -#include // for InitializedVal, NDLoop -#include // for Builder -#include // for HostProfilingInfo -#include // for id -#include // for kernel_param_kind_t +#include // for array +#include // for InitializedVal, NDLoop +#include // for Builder +#include // for HostProfilingInfo +#include // for id +#include // for kernel_param_kind_t #include -#include // for group -#include // for h_item -#include // for id -#include // for item -#include // for kernel_handler -#include // for nd_item -#include // for nd_range -#include // for range, operator* - -#include // for function -#include // for size_t -#include // for enable_if_t, false_type -#include // for declval +#include // for group +#include // for h_item +#include // for id +#include // for item +#include // for kernel_handler +#include // for nd_item +#include // for nd_range +#include // for range, operator* + +#include // for function +#include // for size_t +#include // for enable_if_t, false_type +#include // for declval namespace sycl { inline namespace _V1 { @@ -65,6 +65,11 @@ enum class CGType : unsigned int { SemaphoreSignal = 25, ProfilingTag = 26, EnqueueNativeCommand = 27, +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + PrefetchUSMExp = 28, +#else + PrefetchUSMExpD2H = 28, +#endif }; template struct check_fn_signature { diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 1f11db722b60a..ee8b5cc8ca3c4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -364,14 +365,24 @@ void fill(sycl::queue Q, T *Ptr, const T &Pattern, size_t Count, submit(Q, [&](handler &CGH) { fill(CGH, Ptr, Pattern, Count); }, CodeLoc); } -inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes) { - CGH.prefetch(Ptr, NumBytes); +inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes, + prefetch_type Type = prefetch_type::device) { +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + CGH.ext_oneapi_prefetch_exp(Ptr, NumBytes, Type); +#else + if (Type == prefetch_type::device) { + CGH.prefetch(Ptr, NumBytes); + } else { + CGH.ext_oneapi_prefetch_d2h(Ptr, NumBytes); + } +#endif } inline void prefetch(queue Q, void *Ptr, size_t NumBytes, + prefetch_type Type = prefetch_type::device, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - submit(Q, [&](handler &CGH) { prefetch(CGH, Ptr, NumBytes); }, CodeLoc); + submit(Q, [&](handler &CGH) { prefetch(CGH, Ptr, NumBytes, Type); }, CodeLoc); } inline void mem_advise(handler &CGH, void *Ptr, size_t NumBytes, int Advice) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp new file mode 100644 index 0000000000000..dacd45126a7fb --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp @@ -0,0 +1,33 @@ +//==--------------- enqueue_types.hpp ---- SYCL enqueue types --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +/// @brief Indicates the destination device for USM data to be prefetched to. +enum class prefetch_type { device, host }; + +inline std::string prefetchTypeToString(prefetch_type value) { + switch (value) { + case sycl::ext::oneapi::experimental::prefetch_type::device: + return "prefetch_type::device"; + case sycl::ext::oneapi::experimental::prefetch_type::host: + return "prefetch_type::host"; + default: + return "prefetch_type::unknown"; + } +} + +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 8765198f0bc0f..e228cde7e8900 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -151,10 +151,11 @@ template -class work_group_memory; +namespace ext::oneapi::experimental { +template class work_group_memory; struct image_descriptor; +enum class prefetch_type; +void prefetch(handler &CGH, void *Ptr, size_t NumBytes, prefetch_type Type); } // namespace ext::oneapi::experimental namespace ext::oneapi::experimental::detail { @@ -514,7 +515,8 @@ class __SYCL_EXPORT handler { /// Saves the location of user's code passed in \p CodeLoc for future usage in /// finalize() method. - /// TODO: remove the first version of this func (the one without the IsTopCodeLoc arg) + /// TODO: remove the first version of this func (the one without the + /// IsTopCodeLoc arg) /// at the next ABI breaking window since removing it breaks ABI on windows. void saveCodeLoc(detail::code_location CodeLoc); void saveCodeLoc(detail::code_location CodeLoc, bool IsTopCodeLoc); @@ -724,8 +726,9 @@ class __SYCL_EXPORT handler { detail::KernelLambdaHasKernelHandlerArgT::value; - MHostKernel = std::make_unique< - detail::HostKernel>(KernelFunc); + MHostKernel = + std::make_unique>( + KernelFunc); constexpr bool KernelHasName = detail::getKernelName() != nullptr && @@ -3441,6 +3444,10 @@ class __SYCL_EXPORT handler { void *MDstPtr = nullptr; /// Length to copy or fill (for USM operations). size_t MLength = 0; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + /// USM prefetch direction. + ext::oneapi::experimental::prefetch_type MPrefetchType; +#endif /// Pattern that is used to fill memory object in case command type is fill. std::vector MPattern; /// Storage for a lambda or function object. @@ -3725,6 +3732,28 @@ class __SYCL_EXPORT handler { bool IsDeviceImageScoped, size_t NumBytes, size_t Offset); +// Enqueue function extension's implementation USM prefetch, enabling USM +// prefetch from both host to device, and device to host. +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + // With breaking changes enabled, the handler/CG Nodes have their fields + // modified. This function updates that field in the CG Node. + void ext_oneapi_prefetch_exp(const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type); + // This is a separate function to keep the current handler.prefetch function + // the same. +#else + // Without breaking changes, the handler/CG Nodes fields cannot be modified, + // meaning 1 CG node type cannot indicate both prefetch directions. Thus, the + // default handler.prefetch indicates host to device, and this function serves + // as device to host. + void ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count); +#endif + // Friend prefetch from the enqueue functions extension to allow call to + // private function ext_oneapi_prefetch_d2h + friend void sycl::ext::oneapi::experimental::prefetch( + handler &CGH, void *Ptr, size_t NumBytes, + sycl::ext::oneapi::experimental::prefetch_type Type); + // Changing values in this will break ABI/API. enum class StableKernelCacheConfig : int32_t { Default = 0, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 82a5d17f1cac2..ff5ab50a428e3 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3518,7 +3518,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { queue(std::shared_ptr impl) : impl(impl) {} template - friend const decltype(Obj::impl)& detail::getSyclObjImpl(const Obj &SyclObject); + friend const decltype(Obj::impl) & + detail::getSyclObjImpl(const Obj &SyclObject); template friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index 8d3a5e56ac71a..05fd3ba54027d 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -8,24 +8,27 @@ #pragma once -#include // for AccessorImplHost, AccessorImplPtr -#include // for ArgDesc, HostTask, HostKernelBase -#include // for code_location -#include // for context_impl -#include // for ur_rect_region_t, ur_rect_offset_t -#include // for event_impl -#include // for queue_impl +#include // for AccessorImplHost, AccessorImplPtr +#include // for ArgDesc, HostTask, HostKernelBase +#include // for code_location +#include // for context_impl +#include // for ur_rect_region_t, ur_rect_offset_t +#include // for event_impl +#include // for queue_impl #include -#include // for kernel_impl -#include // for kernel_bundle_impl - -#include // for assert -#include // for shared_ptr, unique_ptr -#include // for size_t -#include // for int32_t -#include // for string -#include // for move -#include // for vector +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +#include // for prefetch_type +#endif +#include // for kernel_impl +#include // for kernel_bundle_impl + +#include // for assert +#include // for shared_ptr, unique_ptr +#include // for size_t +#include // for int32_t +#include // for string +#include // for move +#include // for vector namespace sycl { inline namespace _V1 { @@ -406,6 +409,41 @@ class CGPrefetchUSM : public CG { size_t getLength() { return MLength; } }; +/// Command group class for experimental USM prefetch provided in the enqueue +/// functions extension. +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +class CGPrefetchUSMExp : public CG { + void *MDst; + size_t MLength; + ext::oneapi::experimental::prefetch_type MPrefetchType; + +public: + CGPrefetchUSMExp(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, + ext::oneapi::experimental::prefetch_type Type, + detail::code_location loc = {}) + : CG(CGType::PrefetchUSMExp, std::move(CGData), std::move(loc)), + MDst(DstPtr), MLength(Length), MPrefetchType(Type) {} + void *getDst() { return MDst; } + size_t getLength() { return MLength; } + ext::oneapi::experimental::prefetch_type getPrefetchType() { + return MPrefetchType; + } +}; +#else +class CGPrefetchUSMExpD2H : public CG { + void *MDst; + size_t MLength; + +public: + CGPrefetchUSMExpD2H(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, + detail::code_location loc = {}) + : CG(CGType::PrefetchUSMExpD2H, std::move(CGData), std::move(loc)), + MDst(DstPtr), MLength(Length) {} + void *getDst() { return MDst; } + size_t getLength() { return MLength; } +}; +#endif + /// "Advise USM" command group class. class CGAdviseUSM : public CG { void *MDst; diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 1a1f2ef9cf55f..a8ed2c624d87e 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -243,6 +243,13 @@ class node_impl : public std::enable_shared_from_this { return createCGCopy(); case sycl::detail::CGType::PrefetchUSM: return createCGCopy(); +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case sycl::detail::CGType::PrefetchUSMExp: + return createCGCopy(); +#else + case sycl::detail::CGType::PrefetchUSMExpD2H: + return createCGCopy(); +#endif case sycl::detail::CGType::AdviseUSM: return createCGCopy(); case sycl::detail::CGType::Copy2DUSM: @@ -636,6 +643,31 @@ class node_impl : public std::enable_shared_from_this { << " Length: " << Prefetch->getLength() << "\\n"; } break; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case sycl::detail::CGType::PrefetchUSMExp: + Stream << "CGPrefetchUSMExp \\n"; + if (Verbose) { + sycl::detail::CGPrefetchUSMExp *Prefetch = + static_cast(MCommandGroup.get()); + Stream << "Dst: " << Prefetch->getDst() + << " Length: " << Prefetch->getLength() << " Type: " + << sycl::ext::oneapi::experimental::prefetchTypeToString( + Prefetch->getPrefetchType()) + << "\\n"; + } + break; +#else + case sycl::detail::CGType::PrefetchUSMExpD2H: + Stream << "CGPrefetchUSM (Experimental, Device-To-Host) \\n"; + if (Verbose) { + sycl::detail::CGPrefetchUSMExpD2H *Prefetch = + static_cast( + MCommandGroup.get()); + Stream << "Dst: " << Prefetch->getDst() + << " Length: " << Prefetch->getLength() << "\\n"; + } + break; +#endif case sycl::detail::CGType::AdviseUSM: Stream << "CGAdviseUSM \\n"; if (Verbose) { diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 261b97966ae40..b6b2f981b06f1 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -964,17 +964,22 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, DepEvents.size(), DepEvents.data(), OutEvent); } -void MemoryManager::prefetch_usm(void *Mem, QueueImplPtr Queue, size_t Length, - std::vector DepEvents, - ur_event_handle_t *OutEvent, - const detail::EventImplPtr &OutEventImpl) { +void MemoryManager::prefetch_usm( + void *Mem, QueueImplPtr Queue, size_t Length, + std::vector DepEvents, ur_event_handle_t *OutEvent, + const detail::EventImplPtr &OutEventImpl, + sycl::ext::oneapi::experimental::prefetch_type Dest) { assert(Queue && "USM prefetch must be called with a valid device queue"); const AdapterPtr &Adapter = Queue->getAdapter(); + ur_usm_migration_flags_t MigrationFlag = + (Dest == sycl::ext::oneapi::experimental::prefetch_type::device) + ? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE + : UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); - Adapter->call(Queue->getHandleRef(), Mem, - Length, 0, DepEvents.size(), - DepEvents.data(), OutEvent); + Adapter->call( + Queue->getHandleRef(), Mem, Length, MigrationFlag, DepEvents.size(), + DepEvents.data(), OutEvent); } void MemoryManager::advise_usm(const void *Mem, QueueImplPtr Queue, @@ -1613,11 +1618,16 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector Deps, - ur_exp_command_buffer_sync_point_t *OutSyncPoint) { + ur_exp_command_buffer_sync_point_t *OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type Dest) { const AdapterPtr &Adapter = Context->getAdapter(); + ur_usm_migration_flags_t MigrationFlag = + (Dest == sycl::ext::oneapi::experimental::prefetch_type::device) + ? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE + : UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; Adapter->call( - CommandBuffer, Mem, Length, ur_usm_migration_flags_t(0), Deps.size(), - Deps.data(), 0, nullptr, OutSyncPoint, nullptr, nullptr); + CommandBuffer, Mem, Length, MigrationFlag, Deps.size(), Deps.data(), 0, + nullptr, OutSyncPoint, nullptr, nullptr); } void MemoryManager::ext_oneapi_advise_usm_cmd_buffer( diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index cc573abc62ddb..9c63a9bd7f915 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -11,6 +11,7 @@ #include #include #include +#include // for prefetch_type #include #include #include @@ -149,10 +150,13 @@ class MemoryManager { ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl); - static void prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, - std::vector DepEvents, - ur_event_handle_t *OutEvent, - const detail::EventImplPtr &OutEventImpl); + static void + prefetch_usm(void *Ptr, QueueImplPtr Queue, size_t Len, + std::vector DepEvents, + ur_event_handle_t *OutEvent, + const detail::EventImplPtr &OutEventImpl, + sycl::ext::oneapi::experimental::prefetch_type Dest = + sycl::ext::oneapi::experimental::prefetch_type::device); static void advise_usm(const void *Ptr, QueueImplPtr Queue, size_t Len, ur_usm_advice_flags_t Advice, @@ -251,7 +255,9 @@ class MemoryManager { sycl::detail::ContextImplPtr Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector Deps, - ur_exp_command_buffer_sync_point_t *OutSyncPoint); + ur_exp_command_buffer_sync_point_t *OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type Dest = + sycl::ext::oneapi::experimental::prefetch_type::device); static void ext_oneapi_advise_usm_cmd_buffer( sycl::detail::ContextImplPtr Context, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 005008a74ebd0..a44d029e33bbd 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -28,6 +28,7 @@ #include #include #include +#include // For prefetch_type #include #include @@ -1939,6 +1940,15 @@ static std::string_view cgTypeToString(detail::CGType Type) { case detail::CGType::PrefetchUSM: return "prefetch usm"; break; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case detail::CGType::PrefetchUSMExp: + return "prefetch usm (enqueue extension, experimental)"; + break; +#else + case detail::CGType::PrefetchUSMExpD2H: + return "prefetch usm (experimental, device to host)"; + break; +#endif case detail::CGType::CodeplayHostTask: return "host task"; break; @@ -3025,13 +3035,43 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { if (auto Result = callMemOpHelper( MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, MQueue->getContextImplPtr(), MCommandBuffer, Prefetch->getDst(), - Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint); + Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type::device); Result != UR_RESULT_SUCCESS) return Result; MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case CGType::PrefetchUSMExp: { + CGPrefetchUSMExp *Prefetch = (CGPrefetchUSMExp *)MCommandGroup.get(); + if (auto Result = callMemOpHelper( + MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, + MQueue->getContextImplPtr(), MCommandBuffer, Prefetch->getDst(), + Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, + Prefetch->getPrefetchType()); + Result != UR_RESULT_SUCCESS) + return Result; + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } +#else + case CGType::PrefetchUSMExpD2H: { + CGPrefetchUSMExpD2H *Prefetch = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); + if (auto Result = callMemOpHelper( + MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, + MQueue->getContextImplPtr(), MCommandBuffer, Prefetch->getDst(), + Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type::host); + Result != UR_RESULT_SUCCESS) + return Result; + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } +#endif case CGType::AdviseUSM: { CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get(); if (auto Result = callMemOpHelper( @@ -3238,13 +3278,43 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get(); if (auto Result = callMemOpHelper( MemoryManager::prefetch_usm, Prefetch->getDst(), MQueue, - Prefetch->getLength(), std::move(RawEvents), Event, MEvent); + Prefetch->getLength(), std::move(RawEvents), Event, MEvent, + sycl::ext::oneapi::experimental::prefetch_type::device); + Result != UR_RESULT_SUCCESS) + return Result; + + SetEventHandleOrDiscard(); + return UR_RESULT_SUCCESS; + } + // TODO Either remove non __INTEL_PREVIEW_BREAKING_CHANGES implementation upon + // next ABI breaking window, or rename PrefetchUSM to "PrefetchUSMH2D" +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case CGType::PrefetchUSMExp: { + CGPrefetchUSMExp *Prefetch = (CGPrefetchUSMExp *)MCommandGroup.get(); + if (auto Result = + callMemOpHelper(MemoryManager::prefetch_usm, Prefetch->getDst(), + MQueue, Prefetch->getLength(), std::move(RawEvents), + Event, MEvent, Prefetch->getPrefetchType()); + Result != UR_RESULT_SUCCESS) + return Result; + + SetEventHandleOrDiscard(); + return UR_RESULT_SUCCESS; + } +#else + case CGType::PrefetchUSMExpD2H: { + CGPrefetchUSMExpD2H *Prefetch = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); + if (auto Result = callMemOpHelper( + MemoryManager::prefetch_usm, Prefetch->getDst(), MQueue, + Prefetch->getLength(), std::move(RawEvents), Event, MEvent, + sycl::ext::oneapi::experimental::prefetch_type::host); Result != UR_RESULT_SUCCESS) return Result; SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } +#endif case CGType::AdviseUSM: { CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get(); if (auto Result = diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 692ffb84120ac..e90898ccd1a18 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -597,6 +597,17 @@ event handler::finalize() { CommandGroup.reset(new detail::CGPrefetchUSM( MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); break; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case detail::CGType::PrefetchUSMExp: + CommandGroup.reset(new detail::CGPrefetchUSMExp( + MDstPtr, MLength, std::move(impl->CGData), MPrefetchType, MCodeLoc)); + break; +#else + case detail::CGType::PrefetchUSMExpD2H: + CommandGroup.reset(new detail::CGPrefetchUSMExpD2H( + MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); + break; +#endif case detail::CGType::AdviseUSM: CommandGroup.reset(new detail::CGAdviseUSM(MDstPtr, MLength, impl->MAdvice, std::move(impl->CGData), @@ -1187,6 +1198,26 @@ void handler::prefetch(const void *Ptr, size_t Count) { setType(detail::CGType::PrefetchUSM); } +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +void handler::ext_oneapi_prefetch_exp( + const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type) { + + throwIfActionIsCreated(); + MDstPtr = const_cast(Ptr); + MLength = Count; + MPrefetchType = Type; + setType(detail::CGType::PrefetchUSMExp); +} +#else +void handler::ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count) { + throwIfActionIsCreated(); + MDstPtr = const_cast(Ptr); + MLength = Count; + setType(detail::CGType::PrefetchUSMExpD2H); +} +#endif + void handler::mem_advise(const void *Ptr, size_t Count, int Advice) { throwIfActionIsCreated(); MDstPtr = const_cast(Ptr); diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp new file mode 100644 index 0000000000000..21be1e250ef69 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -0,0 +1,89 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests the enqueue free function using buffers and submit + +#include "../graph_common.hpp" +#include + +static constexpr int N = 100; +static constexpr int Pattern = 42; + +int main() { + queue Q{}; + if (!Q.get_device().get_info()) { + // USM not supported, skipping test and returning early. + return 0; + } + + int *Src = + (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); + int *Dst = + (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); + for (int i = 0; i < N; i++) + Src[i] = Pattern; + + { + exp_ext::command_graph Graph{Q.get_context(), Q.get_device(), {}}; + + Graph.begin_recording(Q); + + // Test submitting host-to-device prefetch + event TestH2D = exp_ext::submit_with_event( + Q, [&](handler &CGH) { exp_ext::prefetch(CGH, Src, sizeof(int) * N); }); + + exp_ext::submit(Q, [&](handler &CGH) { + CGH.depends_on(TestH2D); + exp_ext::parallel_for(CGH, range<1>(N), [=](id<1> i) { + Dst[i] = Src[i] * 2; + }); + }); + + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + exp_ext::execute_graph(Q, GraphExec); + Q.wait_and_throw(); + } + + // Check host-to-device prefetch results + for (int i = 0; i < N; i++) + assert(Dst[i] == Pattern * 2); + + { + exp_ext::command_graph Graph{Q.get_context(), Q.get_device(), {}}; + + Graph.begin_recording(Q); + + // Test submitting device-to-host prefetch + event TestD2H = exp_ext::submit_with_event(Q, [&](handler &CGH) { + exp_ext::parallel_for(CGH, range<1>(N), [=](id<1> i) { + Dst[i] = Src[i] + 1; + }); + }); + + exp_ext::submit(Q, [&](handler &CGH) { + CGH.depends_on(TestD2H); + exp_ext::prefetch(CGH, Dst, sizeof(int) * N, + exp_ext::prefetch_type::host); + }); + + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + exp_ext::execute_graph(Q, GraphExec); + Q.wait_and_throw(); + } + + // Check device-to-host prefetch results + for (int i = 0; i < N; i++) + assert(Dst[i] == Pattern + 1); + + return 0; +} diff --git a/sycl/test-e2e/USM/prefetch_exp.cpp b/sycl/test-e2e/USM/prefetch_exp.cpp new file mode 100644 index 0000000000000..574f07077abf5 --- /dev/null +++ b/sycl/test-e2e/USM/prefetch_exp.cpp @@ -0,0 +1,116 @@ +//==-------- prefetch_exp.cpp - Experimental 2-way USM prefetch test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %{build} -o %t1.out +// RUN: %{run} %t1.out + +#include +#include +#include + +using namespace sycl; + +static constexpr int Count = 100; + +int main() { + queue q([](exception_list el) { + for (auto &e : el) + throw e; + }); + + if (!q.get_device().get_info()) { + // USM not supported, skipping test and returning early. + return 0; + } + + float *Src = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), + q.get_context()); + float *Dest = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), + q.get_context()); + for (int i = 0; i < Count; i++) + Src[i] = i; + + { + // Test host-to-device prefetch via prefetch(handler ...). + event InitPrefetch = + ext::oneapi::experimental::submit_with_event(q, [&](handler &CGH) { + ext::oneapi::experimental::prefetch(CGH, Src, sizeof(float) * Count); + }); + + q.submit([&](handler &CGH) { + CGH.depends_on(InitPrefetch); + CGH.single_task([=]() { + for (int i = 0; i < Count; i++) + Dest[i] = 2 * Src[i]; + }); + }); + q.wait_and_throw(); + + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 2); + } + + // Test device-to-host prefetch via prefetch(handler ...). + event InitPrefetchBack = q.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (int i = 0; i < Count; i++) + Dest[i] = 4 * Src[i]; + }); + }); + + ext::oneapi::experimental::submit(q, [&](handler &CGH) { + CGH.depends_on(InitPrefetch); + ext::oneapi::experimental::prefetch( + CGH, Dest, sizeof(float) * Count, + ext::oneapi::experimental::prefetch_type::host); + }); + q.wait_and_throw(); + + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 4); + } + } + + { + // Test host-to-device prefetch via prefetch(queue ...). + ext::oneapi::experimental::prefetch( + q, Src, sizeof(float) * Count, + ext::oneapi::experimental::prefetch_type::device); + q.wait_and_throw(); + q.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (int i = 0; i < Count; i++) + Dest[i] = 3 * Src[i]; + }); + }); + q.wait_and_throw(); + + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 3); + } + + // Test device-to-host prefetch via prefetch(queue ...). + q.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (int i = 0; i < Count; i++) + Dest[i] = 6 * Src[i]; + }); + }); + q.wait_and_throw(); + ext::oneapi::experimental::prefetch( + q, Src, sizeof(float) * Count, + ext::oneapi::experimental::prefetch_type::host); + q.wait_and_throw(); + + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 6); + } + } + free(Src, q); + free(Dest, q); +} diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index 3432f0ac3484f..4e9f67cf1fc9e 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -17,6 +17,7 @@ add_sycl_unittest(ExtensionsTests OBJECT NoDeviceIPVersion.cpp WorkGroupMemoryBackendArgument.cpp GetLastEvent.cpp + USMPrefetchExp.cpp BFloat16.cpp EventMode.cpp ) diff --git a/sycl/unittests/Extensions/USMPrefetchExp.cpp b/sycl/unittests/Extensions/USMPrefetchExp.cpp new file mode 100644 index 0000000000000..2f2b33cb50586 --- /dev/null +++ b/sycl/unittests/Extensions/USMPrefetchExp.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#include + +static constexpr int N = 8; +static ur_usm_migration_flags_t urUSMPrefetchDirection = -1; + +ur_result_t redefinedEnqueueUSMPrefetch(void *pParams) { + auto params = *static_cast(pParams); + urUSMPrefetchDirection = *(params.pflags); + return UR_RESULT_SUCCESS; +} + +TEST(USMPrefetchExp, CheckURCall) { + using namespace sycl; + unittest::UrMock<> Mock; + mock::getCallbacks().set_replace_callback("urEnqueueUSMPrefetch", + &redefinedEnqueueUSMPrefetch); + queue q; + int *Mem = + (int *)malloc_shared(sizeof(int) * N, q.get_device(), q.get_context()); + + // Check handler calls: + q.submit([&](handler &cgh) { + sycl::ext::oneapi::experimental::prefetch(cgh, Mem, sizeof(int) * N); + }); + q.wait_and_throw(); + EXPECT_EQ(urUSMPrefetchDirection, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); + + q.submit([&](handler &cgh) { + sycl::ext::oneapi::experimental::prefetch( + cgh, Mem, sizeof(int) * N, + sycl::ext::oneapi::experimental::prefetch_type::device); + }); + q.wait_and_throw(); + EXPECT_EQ(urUSMPrefetchDirection, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); + + q.submit([&](handler &cgh) { + sycl::ext::oneapi::experimental::prefetch( + cgh, Mem, sizeof(int) * N, + sycl::ext::oneapi::experimental::prefetch_type::host); + }); + q.wait_and_throw(); + EXPECT_EQ(urUSMPrefetchDirection, UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST); + + // Check queue calls: + sycl::ext::oneapi::experimental::prefetch(q, Mem, sizeof(int) * N); + q.wait_and_throw(); + EXPECT_EQ(urUSMPrefetchDirection, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); + + sycl::ext::oneapi::experimental::prefetch( + q, Mem, sizeof(int) * N, + sycl::ext::oneapi::experimental::prefetch_type::device); + q.wait_and_throw(); + EXPECT_EQ(urUSMPrefetchDirection, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); + + sycl::ext::oneapi::experimental::prefetch( + q, Mem, sizeof(int) * N, + sycl::ext::oneapi::experimental::prefetch_type::host); + q.wait_and_throw(); + EXPECT_EQ(urUSMPrefetchDirection, UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST); + + free(Mem, q); +}