Skip to content
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
0920893
Keep sycl::detail::handler_impl in stack when possible.
Alexandr-Konovalov Mar 5, 2025
a555257
Code formatting.
Alexandr-Konovalov Mar 10, 2025
8255f7c
Keep StorageInitHelper::MEvents in pool.
Alexandr-Konovalov Mar 10, 2025
af7d882
Eliminate std::shared_ptr<detail::queue_impl> from sycl::handler.
Alexandr-Konovalov Mar 18, 2025
67d00a3
Fix keeping reference to temporary object.
Alexandr-Konovalov Mar 18, 2025
18db4f0
Fix after merge.
Alexandr-Konovalov Mar 19, 2025
7db0650
Can't use unique_ptr because of deleter.
Alexandr-Konovalov Mar 20, 2025
8539879
Add missing variable initialization.
Alexandr-Konovalov Mar 20, 2025
db3f229
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov Mar 25, 2025
13decb2
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov Apr 7, 2025
d74d8dd
Fix after merge.
Alexandr-Konovalov Apr 7, 2025
e81a2e9
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov Apr 7, 2025
1682ee1
Code formatting.
Alexandr-Konovalov Apr 7, 2025
130fe0b
Code formatting.
Alexandr-Konovalov Apr 7, 2025
57b0be5
Rollback pmr addition.
Alexandr-Konovalov Apr 7, 2025
04aedf6
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov Apr 8, 2025
399c20b
Move new functionality under __INTEL_PREVIEW_BREAKING_CHANGES.
Alexandr-Konovalov Apr 8, 2025
311cfe3
Code formatting. Add newly exported Linux symbols.
Alexandr-Konovalov Apr 9, 2025
182d52f
Drop invaild const.
Alexandr-Konovalov Apr 9, 2025
18ac302
Do not export symbols that are used only internal.
Alexandr-Konovalov Apr 9, 2025
d46299f
Code formatting.
Alexandr-Konovalov Apr 9, 2025
4bb0838
Remove unneeded changes.
Alexandr-Konovalov Apr 9, 2025
c6eaf90
Drop creation of copy of Self on hot path.
Alexandr-Konovalov Apr 10, 2025
1dbeab7
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov Apr 16, 2025
e874cc9
Fix after merge.
Alexandr-Konovalov Apr 17, 2025
919b4a8
Code formatting.
Alexandr-Konovalov Apr 17, 2025
5495e7b
Add newly exported Windows symbol.
Alexandr-Konovalov Apr 22, 2025
f301472
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov Apr 30, 2025
096f96f
Fix after merge.
Alexandr-Konovalov Apr 30, 2025
2303792
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov Apr 30, 2025
c9d3ece
Drop handler::handler(detail::handler_impl *HandlerImpl).
Alexandr-Konovalov May 2, 2025
5a35d92
Update sycl/include/sycl/handler.hpp
Alexandr-Konovalov May 2, 2025
fc15674
Update sycl/include/sycl/reduction.hpp
Alexandr-Konovalov May 6, 2025
571582a
Update sycl/include/sycl/reduction.hpp
Alexandr-Konovalov May 6, 2025
e87dcb7
Unify check style.
Alexandr-Konovalov May 6, 2025
9993669
Unify check style.
Alexandr-Konovalov May 6, 2025
4dc2485
Unify check style.
Alexandr-Konovalov May 6, 2025
00e7268
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov May 6, 2025
73d7fae
Merge branch 'sycl' into Alexandr-Konovalov/handler-in-stack
Alexandr-Konovalov May 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -427,8 +427,23 @@ class __SYCL_EXPORT handler {
/// \param Queue is a SYCL queue.
/// \param CallerNeedsEvent indicates if the event resulting from this handler
/// is needed by the caller.
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
handler(const std::shared_ptr<detail::queue_impl> &Queue,
bool CallerNeedsEvent);
#else
handler(std::shared_ptr<detail::queue_impl> Queue, bool CallerNeedsEvent);
#endif

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
/// Constructs SYCL handler from the pre-constructed handler_impl and the
/// associated queue. Inside of Graph implementation, the Queue value is not
// used, for those cases it can be initialized with an empty shared_ptr.
///
/// \param HandlerImpl is a pre-constructed handler_impl.
/// \param Queue is a SYCL queue.
handler(detail::handler_impl *HandlerImpl,
const std::shared_ptr<detail::queue_impl> &Queue);
#else
/// Constructs SYCL handler from the associated queue and the submission's
/// primary and secondary queue.
///
Expand All @@ -450,14 +465,17 @@ class __SYCL_EXPORT handler {
__SYCL_DLL_LOCAL handler(std::shared_ptr<detail::queue_impl> Queue,
detail::queue_impl *SecondaryQueue,
bool CallerNeedsEvent);
#endif

#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES)
/// Constructs SYCL handler from Graph.
///
/// The handler will add the command-group as a node to the graph rather than
/// enqueueing it straight away.
///
/// \param Graph is a SYCL command_graph
handler(std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph);
#endif

void *storeRawArg(const void *Ptr, size_t Size);

Expand Down Expand Up @@ -3273,8 +3291,18 @@ class __SYCL_EXPORT handler {
uint64_t SignalValue);

private:
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
// In some cases we need to construct handler_impl in heap. Sole propose
// of MImplOwner is to destroy handler_impl in destructor of handler.
// Can't use unique_ptr because declaration of handler_impl is not available
// in this header.
std::shared_ptr<detail::handler_impl> MImplOwner;
detail::handler_impl *impl;
const std::shared_ptr<detail::queue_impl> &MQueue;
#else
std::shared_ptr<detail::handler_impl> impl;
std::shared_ptr<detail::queue_impl> MQueue;
#endif
std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
detail::ABINeutralKernelNameStrT MKernelName;
Expand Down Expand Up @@ -3741,6 +3769,11 @@ class __SYCL_EXPORT handler {

friend class detail::HandlerAccess;

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_DLL_LOCAL detail::handler_impl *get_impl() { return impl; }
#else
__SYCL_DLL_LOCAL detail::handler_impl *get_impl() { return impl.get(); }
#endif
// Friend free-functions for asynchronous allocation and freeing.
__SYCL_EXPORT friend void
ext::oneapi::experimental::async_free(sycl::handler &h, void *ptr);
Expand Down
54 changes: 37 additions & 17 deletions sycl/include/sycl/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,19 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
return sycl::detail::make_tuple(Elements...);
}

#if __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduGetPreferredWGSize(
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
#else
__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
size_t &NWorkGroups);
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem);
#endif
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
size_t &NWorkGroups);

template <typename T, class BinaryOperation, bool IsOptional>
class ReducerElement;
Expand Down Expand Up @@ -1071,7 +1078,12 @@ class reduction_impl_algo {
std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
CGH.addReduction(Counter);

addCounterInit(CGH, CGH.MQueue, Counter);
#if __INTEL_PREVIEW_BREAKING_CHANGES
std::shared_ptr<detail::queue_impl> Queue(CGH.MQueue);
#else
std::shared_ptr<detail::queue_impl> &Queue = CGH.MQueue;
#endif
addCounterInit(CGH, Queue, Counter);

return Counter.get();
}
Expand Down Expand Up @@ -1229,7 +1241,8 @@ template <>
struct NDRangeReduction<reduction::strategy::local_atomic_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
Expand Down Expand Up @@ -1280,7 +1293,8 @@ struct NDRangeReduction<
reduction::strategy::group_reduce_and_last_wg_detection> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
Expand Down Expand Up @@ -1479,7 +1493,8 @@ void doTreeReductionOnTuple(size_t WorkSize, size_t LID,
template <> struct NDRangeReduction<reduction::strategy::range_basic> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using reducer_type = typename Reduction::reducer_type;
Expand Down Expand Up @@ -1590,7 +1605,8 @@ template <>
struct NDRangeReduction<reduction::strategy::group_reduce_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
Expand Down Expand Up @@ -1626,7 +1642,8 @@ struct NDRangeReduction<
reduction::strategy::local_mem_tree_and_atomic_cross_wg> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using reducer_type = typename Reduction::reducer_type;
Expand Down Expand Up @@ -1687,7 +1704,8 @@ struct NDRangeReduction<
reduction::strategy::group_reduce_and_multiple_kernels> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
static_assert(Reduction::has_identity,
Expand Down Expand Up @@ -1825,7 +1843,8 @@ struct NDRangeReduction<
template <> struct NDRangeReduction<reduction::strategy::basic> {
template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
using element_type = typename Reduction::reducer_element_type;
Expand Down Expand Up @@ -2600,9 +2619,9 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
template <> struct NDRangeReduction<reduction::strategy::multi> {
template <typename KernelName, int Dims, typename PropertiesT,
typename... RestT>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
RestT... Rest) {
static void
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
std::tuple<RestT...> ArgsTuple(Rest...);
constexpr size_t NumArgs = sizeof...(RestT);
auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
Expand Down Expand Up @@ -2644,7 +2663,8 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {

template <typename KernelName, int Dims, typename PropertiesT,
typename KernelType, typename Reduction>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
static void run(handler &CGH,
const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
Reduction &Redu, KernelType &KernelFunc) {
auto Delegate = [&](auto Impl) {
Expand Down Expand Up @@ -2691,9 +2711,9 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
}
template <typename KernelName, int Dims, typename PropertiesT,
typename... RestT>
static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties,
RestT... Rest) {
static void
run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
Rest...);
}
Expand Down
10 changes: 10 additions & 0 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,12 @@ graph_impl::add(std::function<void(handler &)> CGF,
const std::vector<sycl::detail::ArgDesc> &Args,
std::vector<std::shared_ptr<node_impl>> &Deps) {
(void)Args;
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::handler_impl HandlerImpl{shared_from_this()};
sycl::handler Handler{&HandlerImpl, std::shared_ptr<detail::queue_impl>{}};
#else
sycl::handler Handler{shared_from_this()};
#endif

#if XPTI_ENABLE_INSTRUMENTATION
// Save code location if one was set in TLS.
Expand Down Expand Up @@ -2183,7 +2188,12 @@ void dynamic_command_group_impl::finalizeCGFList(
const auto &CGF = CGFList[CGFIndex];
// Handler defined inside the loop so it doesn't appear to the runtime
// as a single command-group with multiple commands inside.
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::handler_impl HandlerImpl{MGraph};
sycl::handler Handler{&HandlerImpl, std::shared_ptr<detail::queue_impl>{}};
#else
sycl::handler Handler{MGraph};
#endif
CGF(Handler);

if (Handler.getType() != sycl::detail::CGType::Kernel &&
Expand Down
13 changes: 13 additions & 0 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,15 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
const detail::code_location &Loc,
bool IsTopCodeLoc,
const SubmissionInfo &SubmitInfo) {
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::handler_impl HandlerImplVal(SecondaryQueue, CallerNeedsEvent);
detail::handler_impl *HandlerImpl = &HandlerImplVal;
handler Handler(HandlerImpl, Self);
#else
handler Handler(Self, SecondaryQueue, CallerNeedsEvent);
auto &HandlerImpl = detail::getSyclObjImpl(Handler);
#endif

#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiTraceEnabled()) {
Handler.saveCodeLoc(Loc, IsTopCodeLoc);
Expand Down Expand Up @@ -371,8 +378,14 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
const detail::code_location &Loc,
bool IsTopCodeLoc,
const SubmissionInfo &SubmitInfo) {
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::handler_impl HandlerImplVal(PrimaryQueue.get(), CallerNeedsEvent);
detail::handler_impl *HandlerImpl = &HandlerImplVal;
handler Handler(HandlerImpl, Self);
#else
handler Handler(Self, CallerNeedsEvent);
auto &HandlerImpl = detail::getSyclObjImpl(Handler);
#endif

#if XPTI_ENABLE_INSTRUMENTATION
if (xptiTraceEnabled()) {
Expand Down
11 changes: 11 additions & 0 deletions sycl/source/detail/reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,15 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
return NumThreads;
}

#if __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t
reduGetMaxWGSize(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
#else
__SYCL_EXPORT size_t
reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
size_t LocalMemBytesPerWorkItem) {
#endif
device Dev = Queue->get_device();
size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();

Expand Down Expand Up @@ -113,8 +119,13 @@ reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
return WGSize;
}

#if __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT size_t reduGetPreferredWGSize(
const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem) {
#else
__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
size_t LocalMemBytesPerWorkItem) {
#endif
// TODO: Graphs extension explicit API uses a handler with a null queue to
// process CGFs, in future we should have access to the device so we can
// correctly calculate this.
Expand Down
1 change: 1 addition & 0 deletions sycl/source/detail/scheduler/commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ class Command {
static std::vector<ur_event_handle_t>
getUrEvents(const std::vector<EventImplPtr> &EventImpls,
const QueueImplPtr &CommandQueue, bool IsHostTaskCommand);

/// Collect UR events from EventImpls and filter out some of them in case of
/// in order queue. Does blocking enqueue if event is expected to produce ur
/// event but has empty native handle.
Expand Down
Loading