Skip to content
21 changes: 15 additions & 6 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ event handler::finalize() {

// TODO checking the size of the events vector and avoiding the call is more
// efficient here at this point
const bool KernelFastPath =
const bool KernelSchedulerBypass =
(Queue && !Graph && !impl->MSubgraphNode && !Queue->hasCommandGraph() &&
!impl->CGData.MRequirements.size() && !MStreamStorage.size() &&
(impl->CGData.MEvents.size() == 0 ||
Expand All @@ -521,7 +521,7 @@ event handler::finalize() {
// Extract arguments from the kernel lambda, if required.
// Skipping this is currently limited to simple kernels on the scheduler
// bypass path.
if (type == detail::CGType::Kernel && impl->MKernelData.getKernelFuncPtr() &&
(!KernelFastPath || impl->MKernelData.hasSpecialCaptures())) {
(!KernelSchedulerBypass || impl->MKernelData.hasSpecialCaptures())) {
impl->MKernelData.extractArgsAndReqsFromLambda();
}

Expand Down Expand Up @@ -633,7 +633,7 @@ event handler::finalize() {
}
}

if (KernelFastPath) {
if (KernelSchedulerBypass) {
// if user does not add a new dependency to the dependency graph, i.e.
// the graph is not changed, then this faster path is used to submit
// kernel bypassing scheduler and avoiding CommandGroup, Command objects
Expand Down Expand Up @@ -879,9 +879,18 @@ event handler::finalize() {
#endif
}

bool DiscardEvent = !impl->MEventNeeded && Queue &&
Queue->supportsDiscardingPiEvents() &&
CommandGroup->getRequirements().size() == 0;
// For kernel submission, regardless of whether an event has been requested,
// the scheduler needs to generate an event so the commands are properly
// ordered (for in-order queue) and synchronized with a barrier (for
// out-of-order queue). The event can only be skipped for the scheduler bypass
// path.
//
// For commands other than kernel submission, the event can be skipped when
// an event has not been requested, the queue supports discarding events,
// and the command would otherwise have been eligible for the scheduler
// bypass (bypassing the scheduler for such commands is not supported yet).
bool DiscardEvent =
(type != detail::CGType::Kernel && KernelSchedulerBypass &&
!impl->MEventNeeded && Queue->supportsDiscardingPiEvents());

detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
std::move(CommandGroup), *Queue, !DiscardEvent);
Expand Down
103 changes: 84 additions & 19 deletions sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

using namespace sycl;

namespace oneapiext = ext::oneapi::experimental;

size_t GEventsWaitCounter = 0;

inline ur_result_t redefinedEventsWaitWithBarrier(void *pParams) {
Expand Down Expand Up @@ -49,20 +51,20 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) {
}

enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 };
std::vector<std::pair<CommandType, size_t>> ExecutedCommands;
std::vector<std::tuple<CommandType, size_t, size_t>> ExecutedCommands;

inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) {
auto params =
*static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
ExecutedCommands.push_back(
{CommandType::KERNEL, *params.pnumEventsInWaitList});
ExecutedCommands.push_back({CommandType::KERNEL, *params.pnumEventsInWaitList,
**params.ppGlobalWorkSize});
return UR_RESULT_SUCCESS;
}

inline ur_result_t customEnqueueUSMFill(void *pParams) {
auto params = *static_cast<ur_enqueue_usm_fill_params_t *>(pParams);
ExecutedCommands.push_back(
{CommandType::MEMSET, *params.pnumEventsInWaitList});
{CommandType::MEMSET, *params.pnumEventsInWaitList, 0});
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -92,6 +94,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -113,11 +116,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {

InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::MEMSET);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
}

TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
Expand All @@ -142,6 +149,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -159,11 +167,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {

InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::MEMSET);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
}

TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
Expand All @@ -186,7 +198,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0});
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -202,8 +214,61 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<1>(ExecutedCommands[0]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
}

// Verifies that kernels submitted through the enqueue-functions extension
// (oneapiext::nd_launch) to an in-order queue behind a blocking host task are
// executed strictly after that host task, in submission order, and with an
// empty event wait list on each kernel launch — presumably because the
// in-order queue serializes them without explicit event dependencies
// (TODO confirm against the scheduler-bypass/event-discard logic in
// handler::finalize). The recorded global work size distinguishes the two
// kernel launches.
TEST_F(SchedulerTest, InOrderQueueCrossDepsEnqueueFunctions) {
// Reset the global log shared with the other tests in this file.
ExecutedCommands.clear();
sycl::unittest::UrMock<> Mock;
// Record every kernel launch (wait-list size and global work size) via the
// customEnqueueKernelLaunchWithArgsExp callback defined above.
mock::getCallbacks().set_before_callback(
"urEnqueueKernelLaunchWithArgsExp",
&customEnqueueKernelLaunchWithArgsExp);

sycl::platform Plt = sycl::platform();

context Ctx{Plt};
queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()};

// Condition variable used to hold the host task open until both kernels
// have been submitted, so the test exercises submission while the queue's
// head-of-line command is still running.
std::mutex CvMutex;
std::condition_variable Cv;
bool ready = false;

// Host task blocks until `ready` is set; it logs itself only after being
// released, so HOST_TASK appearing first in ExecutedCommands proves the
// kernels did not start ahead of it.
InOrderQueue.submit([&](sycl::handler &CGH) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

// Two kernel submissions via the enqueue-functions shortcut; distinct
// global sizes (32 and 64) let the assertions below tell them apart.
oneapiext::nd_launch<TestKernel>(
InOrderQueue, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {});

oneapiext::nd_launch<TestKernel>(
InOrderQueue, nd_range<1>{range<1>{64}, range<1>{32}}, [](nd_item<1>) {});

// Release the host task only after both kernels are enqueued.
{
std::unique_lock<std::mutex> lk(CvMutex);
ready = true;
}
Cv.notify_one();

InOrderQueue.wait();

// Expected order: host task, then kernel(32), then kernel(64), each kernel
// launched with an empty event wait list.
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<2>(ExecutedCommands[1]) /*GlobalWorkSize*/, 32u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<2>(ExecutedCommands[2]) /*GlobalWorkSize*/, 64u);
}