Skip to content
21 changes: 15 additions & 6 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ event handler::finalize() {

// TODO checking the size of the events vector and avoiding the call is more
// efficient here at this point
const bool KernelFastPath =
const bool SchedulerBypassPath =
(Queue && !Graph && !impl->MSubgraphNode && !Queue->hasCommandGraph() &&
!impl->CGData.MRequirements.size() && !MStreamStorage.size() &&
(impl->CGData.MEvents.size() == 0 ||
Expand All @@ -521,7 +521,7 @@ event handler::finalize() {
// Extract arguments from the kernel lambda, if required.
// Skipping this is currently limited to simple kernels on the fast path.
if (type == detail::CGType::Kernel && impl->MKernelData.getKernelFuncPtr() &&
(!KernelFastPath || impl->MKernelData.hasSpecialCaptures())) {
(!SchedulerBypassPath || impl->MKernelData.hasSpecialCaptures())) {
impl->MKernelData.extractArgsAndReqsFromLambda();
}

Expand Down Expand Up @@ -633,7 +633,7 @@ event handler::finalize() {
}
}

if (KernelFastPath) {
if (SchedulerBypassPath) {
// if user does not add a new dependency to the dependency graph, i.e.
// the graph is not changed, then this faster path is used to submit
// kernel bypassing scheduler and avoiding CommandGroup, Command objects
Expand Down Expand Up @@ -879,9 +879,18 @@ event handler::finalize() {
#endif
}

bool DiscardEvent = !impl->MEventNeeded && Queue &&
Queue->supportsDiscardingPiEvents() &&
CommandGroup->getRequirements().size() == 0;
// For kernel submission, regardless of whether an event has been requested,
// the scheduler needs to generate an event so the commands are properly
// ordered (for in-order queue) and synchronized with a barrier (for
// out-of-order queue). The event can only be skipped for the scheduler bypass
// path.
//
// For commands other than kernel submission, if an event has not been
// requested, the queue supports events discarding, and the scheduler
// might have been bypassed (not supported yet), the event can be skipped.
bool DiscardEvent =
(type != detail::CGType::Kernel && SchedulerBypassPath &&
!impl->MEventNeeded && Queue->supportsDiscardingPiEvents());

detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
std::move(CommandGroup), *Queue, !DiscardEvent);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) {
static thread_local size_t counter_urEnqueueKernelLaunch = 0;
// Mock override for urEnqueueKernelLaunch: counts invocations so tests can
// assert how many kernel launches reached the adapter. The old event-is-null
// check (cast of pParams + EXPECT_EQ on *params.pphEvent) was removed
// upstream; the stale deleted diff lines are dropped here so the body matches
// the post-change state.
inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) {
  ++counter_urEnqueueKernelLaunch;
  return UR_RESULT_SUCCESS;
}

Expand All @@ -42,32 +40,24 @@ inline ur_result_t redefined_urEnqueueKernelLaunchWithEvent(void *pParams) {
static thread_local size_t counter_urUSMEnqueueMemcpy = 0;
// Mock override for urUSMEnqueueMemcpy: only counts calls. The stale deleted
// diff lines (param cast + EXPECT_EQ on the event out-pointer) are removed to
// match the post-change state of the file.
inline ur_result_t redefined_urUSMEnqueueMemcpy(void *pParams) {
  ++counter_urUSMEnqueueMemcpy;
  return UR_RESULT_SUCCESS;
}

static thread_local size_t counter_urUSMEnqueueFill = 0;
// Mock override for urUSMEnqueueFill: only counts calls; the removed
// event-null EXPECT_EQ residue from the diff is dropped.
inline ur_result_t redefined_urUSMEnqueueFill(void *pParams) {
  ++counter_urUSMEnqueueFill;
  return UR_RESULT_SUCCESS;
}

static thread_local size_t counter_urUSMEnqueuePrefetch = 0;
// Mock override for urUSMEnqueuePrefetch: only counts calls; the removed
// event-null EXPECT_EQ residue from the diff is dropped.
inline ur_result_t redefined_urUSMEnqueuePrefetch(void *pParams) {
  ++counter_urUSMEnqueuePrefetch;
  return UR_RESULT_SUCCESS;
}

static thread_local size_t counter_urUSMEnqueueMemAdvise = 0;
// Mock override for urUSMEnqueueMemAdvise: only counts calls; the removed
// event-null EXPECT_EQ residue from the diff is dropped.
inline ur_result_t redefined_urUSMEnqueueMemAdvise(void *pParams) {
  ++counter_urUSMEnqueueMemAdvise;
  return UR_RESULT_SUCCESS;
}

Expand Down
102 changes: 83 additions & 19 deletions sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

using namespace sycl;

namespace oneapiext = ext::oneapi::experimental;

size_t GEventsWaitCounter = 0;

inline ur_result_t redefinedEventsWaitWithBarrier(void *pParams) {
Expand Down Expand Up @@ -49,19 +51,19 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) {
}

// Kind of command observed by the mock enqueue callbacks below.
enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 };
// Execution log filled by the mock callbacks:
//   <0> command kind, <1> numEventsInWaitList passed to the adapter,
//   <2> global work size (0 when not applicable).
// Only the post-change tuple declaration is kept; the stale pre-change
// pair-based declaration from the flattened diff is removed — keeping both
// would redeclare the same variable with a different type.
std::vector<std::tuple<CommandType, size_t, size_t>> ExecutedCommands;

// Mock urEnqueueKernelLaunch callback: records a KERNEL entry with the
// wait-list length and the first global-work-size dimension. The stale
// 2-element push_back left over from the flattened diff is removed — it does
// not match the 3-element tuple type of ExecutedCommands.
inline ur_result_t customEnqueueKernelLaunch(void *pParams) {
  auto params = *static_cast<ur_enqueue_kernel_launch_params_t *>(pParams);
  ExecutedCommands.push_back({CommandType::KERNEL, *params.pnumEventsInWaitList,
                              **params.ppGlobalWorkSize});
  return UR_RESULT_SUCCESS;
}

// Mock urEnqueueUSMFill callback: records a MEMSET entry with the wait-list
// length; global work size is not applicable, so 0 is stored. The flattened
// diff left both the old 2-element and new 3-element initializer lines after a
// single push_back( — only the 3-element form matching ExecutedCommands' tuple
// type is kept.
inline ur_result_t customEnqueueUSMFill(void *pParams) {
  auto params = *static_cast<ur_enqueue_usm_fill_params_t *>(pParams);
  ExecutedCommands.push_back(
      {CommandType::MEMSET, *params.pnumEventsInWaitList, 0});
  return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -90,6 +92,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -111,11 +114,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {

InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::MEMSET);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
}

TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
Expand All @@ -139,6 +146,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -156,11 +164,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {

InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
ASSERT_EQ(ExecutedCommands.size(), 3u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::MEMSET);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
}

TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
Expand All @@ -182,7 +194,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
CGH.host_task([&] {
std::unique_lock<std::mutex> lk(CvMutex);
Cv.wait(lk, [&ready] { return ready; });
ExecutedCommands.push_back({CommandType::HOST_TASK, 0});
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
});
});

Expand All @@ -198,8 +210,60 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
InOrderQueue.wait();

ASSERT_EQ(ExecutedCommands.size(), 2u);
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK);
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
CommandType::HOST_TASK);
EXPECT_EQ(std::get<1>(ExecutedCommands[0]) /*EventsCount*/, 0u);
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
CommandType::KERNEL);
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
}

// Checks that, on an in-order queue, kernels submitted via the
// ext::oneapi::experimental enqueue functions (nd_launch) still execute after
// a previously submitted blocking host task, and that each kernel reaches the
// adapter with an empty event wait list (EventsCount == 0) — i.e. ordering is
// maintained without explicit event dependencies. The recorded global work
// sizes (32, then 64) distinguish the two launches.
TEST_F(SchedulerTest, InOrderQueueCrossDepsEnqueueFunctions) {
  ExecutedCommands.clear();
  sycl::unittest::UrMock<> Mock;
  // Record every kernel launch that reaches the (mocked) adapter.
  mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch",
                                           &customEnqueueKernelLaunch);

  sycl::platform Plt = sycl::platform();

  context Ctx{Plt};
  queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()};

  // Condition variable used to hold the host task open until both kernels
  // have been submitted, so submission happens while the queue is blocked.
  std::mutex CvMutex;
  std::condition_variable Cv;
  bool ready = false;

  InOrderQueue.submit([&](sycl::handler &CGH) {
    CGH.host_task([&] {
      std::unique_lock<std::mutex> lk(CvMutex);
      Cv.wait(lk, [&ready] { return ready; });
      // Logged from the host task itself, so it must appear first.
      ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
    });
  });

  // Two kernel submissions via the enqueue-functions shortcut while the host
  // task is still blocked; global sizes 32 and 64 tag them in the log.
  oneapiext::nd_launch<TestKernel>(
      InOrderQueue, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {});

  oneapiext::nd_launch<TestKernel>(
      InOrderQueue, nd_range<1>{range<1>{64}, range<1>{32}}, [](nd_item<1>) {});

  // Release the host task and drain the queue.
  {
    std::unique_lock<std::mutex> lk(CvMutex);
    ready = true;
  }
  Cv.notify_one();

  InOrderQueue.wait();

  // Expected order: host task, then kernel(32), then kernel(64); both kernels
  // enqueued with an empty wait list.
  ASSERT_EQ(ExecutedCommands.size(), 3u);
  EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
            CommandType::HOST_TASK);
  EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
            CommandType::KERNEL);
  EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
  EXPECT_EQ(std::get<2>(ExecutedCommands[1]) /*GlobalWorkSize*/, 32u);
  EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
            CommandType::KERNEL);
  EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
  EXPECT_EQ(std::get<2>(ExecutedCommands[2]) /*GlobalWorkSize*/, 64u);
}
Loading