Skip to content

Commit 1e77cae

Browse files
committed
Merge branch 'no_handler_single_task' into temp_no_handler_integration_v2
2 parents f464e17 + c0345df commit 1e77cae

File tree

10 files changed

+102
-18
lines changed

10 files changed

+102
-18
lines changed

devops/scripts/benchmarks/html/scripts.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ function createChart(data, containerId, type) {
345345
if (elements.length > 0) {
346346
const point = elements[0].element.$context.raw;
347347
if (point.gitHash && point.gitRepo) {
348-
window.open(`https://github.com/${point.gitRepo}/commit/${point.gitHash}`, '_blank');
348+
window.open(`${point.gitRepo}/commit/${point.gitHash}`, '_blank');
349349
}
350350
}
351351
};

sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,13 +170,13 @@ class SYCLToolchain {
170170
llvm::vfs::getRealFileSystem());
171171
FS->pushOverlay(ToolchainFS);
172172
if (FSOverlay)
173-
FS->pushOverlay(FSOverlay);
173+
FS->pushOverlay(std::move(FSOverlay));
174174

175175
auto Files = llvm::makeIntrusiveRefCnt<clang::FileManager>(
176176
clang::FileSystemOptions{"." /* WorkingDir */}, FS);
177177

178178
Action A{FEAction};
179-
ToolInvocation TI{CommandLine, &A, Files.get(),
179+
ToolInvocation TI{std::move(CommandLine), &A, Files.get(),
180180
std::make_shared<PCHContainerOperations>()};
181181
TI.setDiagnosticConsumer(DiagConsumer ? DiagConsumer : &IgnoreDiag);
182182

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,13 @@ template <typename KernelName = sycl::detail::auto_name, typename KernelType>
152152
void single_task(queue Q, const KernelType &KernelObj,
153153
const sycl::detail::code_location &CodeLoc =
154154
sycl::detail::code_location::current()) {
155+
/*
155156
submit(
156157
std::move(Q),
157158
[&](handler &CGH) { single_task<KernelName>(CGH, KernelObj); }, CodeLoc);
159+
*/
160+
detail::submit_kernel_direct_single_task<KernelName>(std::move(Q), empty_properties_t{},
161+
KernelObj);
158162
}
159163

160164
template <typename... ArgsT>
@@ -265,7 +269,7 @@ void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
265269
!(ext::oneapi::experimental::detail::
266270
HasKernelPropertiesGetMethod<
267271
const KernelType &>::value)) {
268-
detail::submit_kernel_direct<KernelName>(std::move(Q), empty_properties_t{},
272+
detail::submit_kernel_direct_parallel_for<KernelName>(std::move(Q), empty_properties_t{},
269273
Range, KernelObj);
270274
} else {
271275
submit(std::move(Q), [&](handler &CGH) {

sycl/include/sycl/khr/free_function_commands.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
161161
if constexpr (!(ext::oneapi::experimental::detail::
162162
HasKernelPropertiesGetMethod<
163163
const KernelType &>::value)) {
164-
detail::submit_kernel_direct(
164+
detail::submit_kernel_direct_parallel_for(
165165
q, ext::oneapi::experimental::empty_properties_t{},
166166
nd_range<1>(r, size), std::forward<KernelType>(k));
167167
} else {
@@ -179,7 +179,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
179179
if constexpr (!(ext::oneapi::experimental::detail::
180180
HasKernelPropertiesGetMethod<
181181
const KernelType &>::value)) {
182-
detail::submit_kernel_direct(
182+
detail::submit_kernel_direct_parallel_for(
183183
q, ext::oneapi::experimental::empty_properties_t{},
184184
nd_range<2>(r, size), std::forward<KernelType>(k));
185185
} else {
@@ -197,7 +197,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
197197
if constexpr (!(ext::oneapi::experimental::detail::
198198
HasKernelPropertiesGetMethod<
199199
const KernelType &>::value)) {
200-
detail::submit_kernel_direct(
200+
detail::submit_kernel_direct_parallel_for(
201201
q, ext::oneapi::experimental::empty_properties_t{},
202202
nd_range<3>(r, size), std::forward<KernelType>(k));
203203
} else {
@@ -314,7 +314,10 @@ template <typename KernelType>
314314
void launch_task(const sycl::queue &q, const KernelType &k,
315315
const sycl::detail::code_location &codeLoc =
316316
sycl::detail::code_location::current()) {
317-
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
317+
//submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
318+
detail::submit_kernel_direct_single_task(q,
319+
ext::oneapi::experimental::empty_properties_t{},
320+
k, codeLoc);
318321
}
319322

320323
template <typename... Args>

sycl/include/sycl/queue.hpp

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ class __SYCL_EXPORT SubmissionInfo {
158158

159159
template <typename KernelName = detail::auto_name, bool EventNeeded = false,
160160
typename PropertiesT, typename KernelTypeUniversalRef, int Dims>
161-
auto submit_kernel_direct(
161+
auto submit_kernel_direct_parallel_for(
162162
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
163163
KernelTypeUniversalRef &&KernelFunc,
164164
const detail::code_location &CodeLoc = detail::code_location::current()) {
@@ -211,6 +211,53 @@ auto submit_kernel_direct(
211211
}
212212
}
213213

214+
template <typename KernelName = detail::auto_name, bool EventNeeded = false,
215+
typename PropertiesT, typename KernelTypeUniversalRef>
216+
auto submit_kernel_direct_single_task(
217+
const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc,
218+
const detail::code_location &CodeLoc = detail::code_location::current()) {
219+
// TODO Properties not supported yet
220+
(void)Props;
221+
static_assert(
222+
std::is_same_v<PropertiesT,
223+
ext::oneapi::experimental::empty_properties_t>,
224+
"Setting properties not supported yet for no-CGH kernel submit.");
225+
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
226+
227+
using KernelType =
228+
std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;
229+
230+
using NameT =
231+
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
232+
233+
detail::KernelWrapper<detail::WrapAs::single_task, NameT, KernelType,
234+
void, PropertiesT>::wrap(KernelFunc);
235+
236+
HostKernelRef<KernelType, KernelTypeUniversalRef, void, 1>
237+
HostKernel(std::forward<KernelTypeUniversalRef>(KernelFunc));
238+
239+
// Instantiating the kernel on the host improves debugging.
240+
// Passing this pointer to another translation unit prevents optimization.
241+
#ifndef NDEBUG
242+
// TODO: call library to prevent dropping call due to optimization
243+
(void)
244+
detail::GetInstantiateKernelOnHostPtr<KernelType, void, 1>();
245+
#endif
246+
247+
detail::DeviceKernelInfo *DeviceKernelInfoPtr =
248+
&detail::getDeviceKernelInfo<NameT>();
249+
250+
if constexpr (EventNeeded) {
251+
return submit_kernel_direct_with_event_impl(
252+
Queue, nd_range<1>{1, 1}, HostKernel, DeviceKernelInfoPtr,
253+
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
254+
} else {
255+
submit_kernel_direct_without_event_impl(
256+
Queue, nd_range<1>{1, 1}, HostKernel, DeviceKernelInfoPtr,
257+
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
258+
}
259+
}
260+
214261
} // namespace detail
215262

216263
namespace ext ::oneapi ::experimental {
@@ -2720,14 +2767,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
27202767
void(kernel_handler)>::value),
27212768
"sycl::queue.single_task() requires a kernel instead of command group. "
27222769
"Use queue.submit() instead");
2723-
2770+
/*
27242771
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
27252772
return submit(
27262773
[&](handler &CGH) {
27272774
CGH.template single_task<KernelName, KernelType, PropertiesT>(
27282775
Properties, KernelFunc);
27292776
},
27302777
TlsCodeLocCapture.query());
2778+
*/
2779+
2780+
(void)Properties;
2781+
return detail::submit_kernel_direct_single_task<KernelName, true>(
2782+
*this, ext::oneapi::experimental::empty_properties_t{},
2783+
KernelFunc, CodeLoc);
2784+
27312785
}
27322786

27332787
/// single_task version with a kernel represented as a lambda.
@@ -3275,7 +3329,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
32753329
parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
32763330
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
32773331
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
3278-
32793332
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;
32803333

32813334
// TODO The handler-less path does not support reductions and kernel
@@ -3284,7 +3337,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
32843337
!(ext::oneapi::experimental::detail::
32853338
HasKernelPropertiesGetMethod<
32863339
const KernelType &>::value)) {
3287-
return detail::submit_kernel_direct<KernelName, true>(
3340+
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
32883341
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
32893342
Rest...);
32903343
} else {

sycl/source/detail/graph/graph_impl.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,13 @@ graph_impl::add(std::shared_ptr<dynamic_command_group_impl> &DynCGImpl,
549549
return NodeImpl;
550550
}
551551

552+
std::shared_ptr<sycl::detail::queue_impl> graph_impl::getQueue() const {
553+
std::shared_ptr<sycl::detail::queue_impl> Return{};
554+
if (!MRecordingQueues.empty())
555+
Return = MRecordingQueues.begin()->lock();
556+
return Return;
557+
}
558+
552559
void graph_impl::addQueue(sycl::detail::queue_impl &RecordingQueue) {
553560
MRecordingQueues.insert(RecordingQueue.weak_from_this());
554561
}
@@ -870,10 +877,6 @@ exec_graph_impl::exec_graph_impl(sycl::context Context,
870877
const std::shared_ptr<graph_impl> &GraphImpl,
871878
const property_list &PropList)
872879
: MSchedule(), MGraphImpl(GraphImpl), MSyncPoints(),
873-
MQueueImpl(sycl::detail::queue_impl::create(
874-
*sycl::detail::getSyclObjImpl(GraphImpl->getDevice()),
875-
*sycl::detail::getSyclObjImpl(Context), sycl::async_handler{},
876-
sycl::property_list{})),
877880
MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(),
878881
MSchedulerDependencies(),
879882
MIsUpdatable(PropList.has_property<property::graph::updatable>()),
@@ -893,6 +896,15 @@ exec_graph_impl::exec_graph_impl(sycl::context Context,
893896
}
894897
// Copy nodes from GraphImpl and merge any subgraph nodes into this graph.
895898
duplicateNodes();
899+
900+
if (auto PlaceholderQueuePtr = GraphImpl->getQueue()) {
901+
MQueueImpl = PlaceholderQueuePtr;
902+
} else {
903+
MQueueImpl = sycl::detail::queue_impl::create(
904+
*sycl::detail::getSyclObjImpl(GraphImpl->getDevice()),
905+
*sycl::detail::getSyclObjImpl(Context), sycl::async_handler{},
906+
sycl::property_list{});
907+
}
896908
}
897909

898910
exec_graph_impl::~exec_graph_impl() {

sycl/source/detail/graph/graph_impl.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
172172
node_impl &add(std::shared_ptr<dynamic_command_group_impl> &DynCGImpl,
173173
nodes_range Deps);
174174

175+
std::shared_ptr<sycl::detail::queue_impl> getQueue() const;
176+
175177
/// Add a queue to the set of queues which are currently recording to this
176178
/// graph.
177179
/// @param RecordingQueue Queue to add to set.

sycl/source/detail/queue_impl.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,14 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
662662
: true) &&
663663
!hasCommandGraph();
664664

665+
if (isInOrder()) {
666+
if (SchedulerBypass) {
667+
MNoLastEventMode.store(true, std::memory_order_relaxed);
668+
} else {
669+
MNoLastEventMode.store(false, std::memory_order_relaxed);
670+
}
671+
}
672+
665673
EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
666674

667675
// Sync with the last event for in order queue. For scheduler-bypass flow,

sycl/source/interop_handle.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,9 @@ ur_native_handle_t interop_handle::getNativeContext() const {
6060

6161
ur_native_handle_t
6262
interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const {
63-
return MQueue->getNative(NativeHandleDesc);
63+
if (MQueue != nullptr)
64+
return MQueue->getNative(NativeHandleDesc);
65+
return 0;
6466
}
6567

6668
ur_native_handle_t interop_handle::getNativeGraph() const {

sycl/test-e2e/SubGroup/sub_group_as_vec.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ int main(int argc, char *argv[]) {
3333
queue.submit([&](sycl::handler &cgh) {
3434
auto global = buf.get_access<sycl::access::mode::read_write,
3535
sycl::access::target::device>(cgh);
36-
#ifdef DUSE_DEPRECATED_LOCAL_ACC
36+
#ifdef USE_DEPRECATED_LOCAL_ACC
3737
sycl::accessor<sycl::vec<int, 2>, 1, sycl::access::mode::read_write,
3838
sycl::access::target::local>
3939
local(N, cgh);

0 commit comments

Comments
 (0)