Commit 4369958

Merge branch 'sycl' into msan-always-reserve-origin-mem

2 parents: e6f9868 + 2ae009e

File tree: 6 files changed, +203 −47 lines

sycl/doc/design/SYCLNativeCPU.md

Lines changed: 14 additions & 0 deletions
```diff
@@ -62,6 +62,20 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia
 
 The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`.
 
+### oneTBB integration
+
+SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB support is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time:
+
+```
+python3 buildbot/configure.py \
+  --native_cpu \
+  --cmake-opt=-DNATIVECPU_WITH_ONETBB=On
+```
+
+This pulls oneTBB into SYCL Native CPU via CMake `FetchContent`; DPC++ can then be built as usual.
+
+By default, SYCL Native CPU uses its own scheduler, whose only dependency is standard C++.
+
 # Supported features and current limitations
 
 The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu).
```
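As a quick smoke test for the runtime selection described above, a minimal SYCL program can be run under the selector. This is an illustrative sketch, not part of the commit; the kernel and the exit-code check are made up:

```cpp
// Build with a Native CPU-enabled DPC++ (clang++ -fsycl ...), then run:
//   ONEAPI_DEVICE_SELECTOR=native_cpu:cpu ./a.out
#include <sycl/sycl.hpp>

#include <iostream>

int main() {
  sycl::queue q; // the selector restricts device discovery to Native CPU
  std::cout << q.get_device().get_info<sycl::info::device::name>() << "\n";

  int data[64] = {};
  {
    sycl::buffer<int, 1> buf{data, sycl::range<1>{64}};
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor acc{buf, cgh, sycl::write_only};
      cgh.parallel_for(sycl::range<1>{64}, [=](sycl::id<1> i) {
        acc[i] = static_cast<int>(i[0]);
      });
    });
  } // buffer destruction waits for the kernel and copies back into data
  return data[63] == 63 ? 0 : 1;
}
```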

unified-runtime/source/adapters/native_cpu/CMakeLists.txt

Lines changed: 49 additions & 0 deletions
```diff
@@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME}
   ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
   ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
@@ -51,6 +52,34 @@ set_target_properties(${TARGET_NAME} PROPERTIES
   SOVERSION "${PROJECT_VERSION_MAJOR}"
 )
 
+# oneTBB is used as an optional NativeCPU backend and disabled by default.
+option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
+if(NATIVECPU_WITH_ONETBB)
+  message(STATUS "Configuring Native CPU adapter with oneTBB backend.")
+
+  include(FetchContent)
+  FetchContent_Declare(
+    tbb
+    GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
+    # commit 4e4fffed4fb86ae0960a3364700f549b539c777e (HEAD -> master, origin/master, origin/HEAD)
+    # Author: Ilya Isaev <[email protected]>
+    # Date: Mon Aug 18 10:35:26 2025 +0200
+    # Improve task_arena interoperability with task_groups (#1784)
+    GIT_TAG 4e4fffed4fb86ae0960a3364700f549b539c777e
+    CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
+    OVERRIDE_FIND_PACKAGE
+  )
+  set(TBB_TEST OFF CACHE INTERNAL "" FORCE)
+  set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE)
+  set(TBB_BENCH OFF CACHE INTERNAL "" FORCE)
+  set(TBB_BUILD ON CACHE INTERNAL "" FORCE)
+  set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
+  set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
+  set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
+  set(CMAKE_INCLUDE_CURRENT_DIR OFF)
+  FetchContent_MakeAvailable(tbb)
+endif()
+
 find_package(Threads REQUIRED)
 
 target_link_libraries(${TARGET_NAME} PRIVATE
@@ -63,3 +92,23 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 target_include_directories(${TARGET_NAME} PRIVATE
   "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
+
+if(NATIVECPU_WITH_ONETBB)
+  target_link_libraries(${TARGET_NAME} PRIVATE
+    TBB::tbb
+  )
+  if(NOT MSVC)
+    # oneTBB currently casts away some const qualifiers
+    # todo: check if compiler actually supports these options
+    target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow -Wno-unknown-warning-option)
+    target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
+  endif()
+
+  # Undefine the _DEBUG option in release builds to find the
+  # release tbbbind
+  if(NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+    target_compile_options(tbb PRIVATE -U_DEBUG)
+  endif()
+
+  target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
+endif()
```
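Since the hunk above both pins the oneTBB revision and disables its tests through cache overrides, a standalone program against the two entry points the adapter actually uses (`task_group` and `info::default_concurrency`) is a convenient way to sanity-check the fetched build. A sketch, not part of the commit:

```cpp
#include <oneapi/tbb.h>

#include <cassert>
#include <cstdio>

int main() {
  // The adapter sizes its thread count from this query (see threadpool.hpp).
  const int hw = oneapi::tbb::info::default_concurrency();
  assert(hw >= 1);

  // task_group is the primitive the new TBB_threadpool wraps.
  oneapi::tbb::task_group tg;
  int value = 0;
  tg.run([&] { value = 42; }); // a single task, so no race on value
  tg.wait();                   // mirrors TBB_threadpool::wait_all()
  std::printf("concurrency=%d value=%d\n", hw, value);
  return value == 42 ? 0 : 1;
}
```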

unified-runtime/source/adapters/native_cpu/enqueue.cpp

File mode changed: 100644 → 100755
Lines changed: 49 additions & 39 deletions
```diff
@@ -70,8 +70,18 @@ class WaitInfo {
   }
 };
 
+template <class T>
 inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList,
-                                   const ur_event_handle_t *phEventWaitList) {
+                                   const ur_event_handle_t *phEventWaitList,
+                                   const T &scheduler) {
+  if (numEventsInWaitList && !scheduler.CanWaitInThread()) {
+    // Waiting for dependent events in threads launched by the enqueue may
+    // not work correctly for some backend/schedulers, so we have the safe
+    // option here to wait in the main thread instead (potentially at the
+    // expense of performance).
+    urEventWait(numEventsInWaitList, phEventWaitList);
+    numEventsInWaitList = 0;
+  }
   return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList);
 }
 
@@ -151,7 +161,7 @@
 
   auto &tp = hQueue->getDevice()->tp;
   const size_t numParallelThreads = tp.num_threads();
-  std::vector<std::future<void>> futures;
+  auto Tasks = native_cpu::getScheduler(tp);
   auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
   auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
   auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
@@ -162,7 +172,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
   kernel->updateMemPool(numParallelThreads);
 
-  auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
+  auto InEvents =
+      native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
 
   const size_t numWG = numWG0 * numWG1 * numWG2;
   const size_t numWGPerThread = numWG / numParallelThreads;
@@ -177,42 +188,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     rangeEnd[0] = rangeEnd[3] % numWG0;
     rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
     rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
-    futures.emplace_back(tp.schedule_task(
-        [ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3],
-         numWG0, numWG1, numParallelThreads](size_t threadId) {
-          auto state = getState(ndr);
-          InEvents.wait();
-          for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
-                      g2 = rangeStart[2], g3 = rangeStart[3];
-               g3 < rangeEnd; ++g3) {
+    Tasks.schedule([ndr, InEvents, &kernel = *kernel, rangeStart,
+                    rangeEnd = rangeEnd[3], numWG0, numWG1,
+                    numParallelThreads](size_t threadId) {
+      auto state = getState(ndr);
+      InEvents.wait();
+      for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2],
+                  g3 = rangeStart[3];
+           g3 < rangeEnd; ++g3) {
 #ifdef NATIVECPU_USE_OCK
-            state.update(g0, g1, g2);
-            kernel._subhandler(
-                kernel.getArgs(numParallelThreads, threadId).data(), &state);
+        state.update(g0, g1, g2);
+        kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
+                           &state);
 #else
-            for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
-              for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
-                for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
-                  state.update(g0, g1, g2, local0, local1, local2);
-                  kernel._subhandler(
-                      kernel.getArgs(numParallelThreads, threadId).data(),
-                      &state);
-                }
-              }
+        for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
+          for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
+            for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
+              state.update(g0, g1, g2, local0, local1, local2);
+              kernel._subhandler(
+                  kernel.getArgs(numParallelThreads, threadId).data(), &state);
             }
+          }
+        }
 #endif
-            if (++g0 == numWG0) {
-              g0 = 0;
-              if (++g1 == numWG1) {
-                g1 = 0;
-                ++g2;
-              }
-            }
+        if (++g0 == numWG0) {
+          g0 = 0;
+          if (++g1 == numWG1) {
+            g1 = 0;
+            ++g2;
           }
-        }));
+        }
+      }
+    });
     rangeStart = rangeEnd;
   }
-  event->set_futures(futures);
+  event->set_tasksinfo(Tasks.getMovedTaskInfo());
 
   if (phEvent) {
     *phEvent = event;
@@ -248,14 +258,14 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
     return result;
   }
   auto &tp = hQueue->getDevice()->tp;
-  std::vector<std::future<void>> futures;
+  auto Tasks = native_cpu::getScheduler(tp);
   auto InEvents =
-      native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
-  futures.emplace_back(tp.schedule_task([f, InEvents](size_t) {
+      native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
+  Tasks.schedule([f, InEvents](size_t) {
     InEvents.wait();
    f();
-  }));
-  event->set_futures(futures);
+  });
+  event->set_tasksinfo(Tasks.getMovedTaskInfo());
   event->set_callback(
       [event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); });
   return UR_RESULT_SUCCESS;
@@ -465,7 +475,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
   // TODO: error checking
   // TODO: handle async
   void *startingPtr = hBuffer->_mem + offset;
-  unsigned steps = size / patternSize;
+  size_t steps = size / patternSize;
   for (unsigned i = 0; i < steps; i++) {
     memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
            patternSize);
@@ -575,7 +585,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
       break;
     }
    default: {
-      for (unsigned int step{0}; step < size; step += patternSize) {
+      for (size_t step{0}; step < size; step += patternSize) {
        auto *dest = reinterpret_cast<void *>(
            reinterpret_cast<uint8_t *>(ptr) + step);
        memcpy(dest, pPattern, patternSize);
```
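The launch path above flattens the three-dimensional work-group range, hands each task a contiguous slice of linear ids, and steps the (g0, g1, g2) coordinates in lockstep with the linear counter g3. A self-contained sketch of that decomposition follows; the sizes and the remainder policy are illustrative, while the modulo/division and the carry-propagating increment match the commit:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  const size_t numWG0 = 4, numWG1 = 2, numWG2 = 3; // illustrative sizes
  const size_t numWG = numWG0 * numWG1 * numWG2;
  const size_t numThreads = 4;
  const size_t perThread = numWG / numThreads;
  for (size_t t = 0; t < numThreads; ++t) {
    const size_t begin = t * perThread;
    // give any remainder to the last slice (one simple policy)
    const size_t end = (t + 1 == numThreads) ? numWG : begin + perThread;
    // recover 3D coordinates from the linear id, as rangeEnd[0..2] does
    size_t g0 = begin % numWG0;
    size_t g1 = (begin / numWG0) % numWG1;
    size_t g2 = begin / (numWG0 * numWG1);
    for (size_t g3 = begin; g3 < end; ++g3) {
      std::printf("task %zu -> WG(%zu, %zu, %zu)\n", t, g0, g1, g2);
      // carry-propagating increment, identical to the task body above
      if (++g0 == numWG0) {
        g0 = 0;
        if (++g1 == numWG1) {
          g1 = 0;
          ++g2;
        }
      }
    }
  }
  return 0;
}
```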

unified-runtime/source/adapters/native_cpu/event.cpp

Lines changed: 3 additions & 4 deletions
```diff
@@ -11,6 +11,7 @@
 #include "ur_api.h"
 
 #include "common.hpp"
+#include "device.hpp"
 #include "event.hpp"
 #include "queue.hpp"
 #include <cstdint>
@@ -111,7 +112,7 @@ urEnqueueTimestampRecordingExp(ur_queue_handle_t /*hQueue*/, bool /*blocking*/,
 ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue,
                                        ur_command_t command_type)
     : queue(queue), context(queue->getContext()), command_type(command_type),
-      done(false) {
+      done(false), tasksinfo(queue->getDevice()->tp) {
   this->queue->addEvent(this);
 }
 
@@ -126,9 +127,7 @@ void ur_event_handle_t_::wait() {
   if (done) {
     return;
   }
-  for (auto &f : futures) {
-    f.wait();
-  }
+  this->tasksinfo.wait_all();
   queue->removeEvent(this);
   done = true;
   // The callback may need to acquire the lock, so we unlock it here
```

unified-runtime/source/adapters/native_cpu/event.hpp

Lines changed: 4 additions & 3 deletions
```diff
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 #include "common.hpp"
+#include "threadpool.hpp"
 #include "ur_api.h"
 #include <cstdint>
 #include <future>
@@ -42,9 +43,9 @@ struct ur_event_handle_t_ : RefCounted {
 
   ur_command_t getCommandType() const { return command_type; }
 
-  void set_futures(std::vector<std::future<void>> &fs) {
+  void set_tasksinfo(native_cpu::tasksinfo_t &&fs) {
     std::lock_guard<std::mutex> lock(mutex);
-    futures = std::move(fs);
+    tasksinfo = std::move(fs);
   }
 
   void tick_start();
@@ -61,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted {
   ur_command_t command_type;
   bool done;
   std::mutex mutex;
-  std::vector<std::future<void>> futures;
+  native_cpu::tasksinfo_t tasksinfo;
   std::packaged_task<void()> callback;
   uint64_t timestamp_start = 0;
   uint64_t timestamp_end = 0;
```

unified-runtime/source/adapters/native_cpu/threadpool.hpp

Lines changed: 84 additions & 1 deletion
```diff
@@ -207,7 +207,90 @@ template <typename ThreadPoolT> class threadpool_interface {
     return ret;
   }
 };
+using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
 
-using threadpool_t = threadpool_interface<detail::simple_thread_pool>;
+class TasksInfo_TP {
+  using FType = std::future<void>;
+  std::vector<FType> futures;
 
+public:
+  void schedule(FType &&f) { futures.emplace_back(std::move(f)); }
+  void wait_all() {
+    for (auto &f : futures)
+      f.wait();
+  }
+  TasksInfo_TP(simple_threadpool_t &) {}
+};
+
+template <class TP, class TaskInfo> struct Scheduler_base {
+  TP &ref;
+  TaskInfo ti;
+  Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {}
+  TaskInfo getMovedTaskInfo() { return std::move(ti); }
+  static constexpr bool CanWaitInThread() { return true; }
+};
+
+template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> {
+  using Scheduler_base<TP, TasksInfo_TP>::Scheduler_base;
+
+  template <class T> void schedule(T &&task) {
+    this->ti.schedule(this->ref.schedule_task(std::forward<T>(task)));
+  }
+};
+
+template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) {
+  return Scheduler<TPType>(tp);
+}
+
+} // namespace native_cpu
+
+#ifdef NATIVECPU_WITH_ONETBB
+// Simple TBB backend
+#include "oneapi/tbb.h"
+namespace native_cpu {
+
+class TBB_threadpool {
+  oneapi::tbb::task_group tasks;
+
+public:
+  void wait_all() { tasks.wait(); }
+  oneapi::tbb::task_group &Tasks() { return tasks; }
+  size_t num_threads() const noexcept {
+    return oneapi::tbb::info::default_concurrency();
+  }
+};
+
+class TBB_TasksInfo {
+  TBB_threadpool *tp;
+
+public:
+  void wait_all() { tp->wait_all(); }
+  TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
+};
+
+template <>
+struct Scheduler<TBB_threadpool>
+    : Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
+  using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;
+  template <class T> void schedule(T &&task_) {
+    ref.Tasks().run([task = std::move(task_)]() {
+      auto thread_id = tbb::this_task_arena::current_thread_index();
+      assert(thread_id >= 0 &&
+             thread_id < oneapi::tbb::info::default_concurrency());
+      task(thread_id);
+    });
+  }
+  static constexpr bool CanWaitInThread() { return false; }
+};
+
+using tasksinfo_t = TBB_TasksInfo;
+using threadpool_t = TBB_threadpool;
+} // namespace native_cpu
+
+#else
+// The default backend
+namespace native_cpu {
+using tasksinfo_t = TasksInfo_TP;
+using threadpool_t = simple_threadpool_t;
 } // namespace native_cpu
+#endif
```
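Both backends are driven through the same three-step surface: obtain a `Scheduler` for the pool, `schedule` the work, then move the resulting `tasksinfo_t` into the event so `wait()` can drain it later. A minimal usage sketch built from the names this commit adds, with the loop body as a stand-in for the kernel task:

```cpp
#include "threadpool.hpp" // native_cpu::threadpool_t, tasksinfo_t, getScheduler

#include <cstddef>

native_cpu::tasksinfo_t run_chunks(native_cpu::threadpool_t &tp) {
  // Resolves to the generic Scheduler or the TBB specialization,
  // depending on NATIVECPU_WITH_ONETBB.
  auto Tasks = native_cpu::getScheduler(tp);
  for (int chunk = 0; chunk < 4; ++chunk) {
    Tasks.schedule([chunk](size_t threadId) {
      // per-chunk work; threadId indexes per-thread state (cf. getArgs)
      (void)chunk;
      (void)threadId;
    });
  }
  // what urEnqueueKernelLaunch stores via event->set_tasksinfo(...)
  return Tasks.getMovedTaskInfo();
}

// Later, ur_event_handle_t_::wait() boils down to: tasksinfo.wait_all();
```

Note the `CanWaitInThread()` flag on each `Scheduler`: the in-house pool reports `true`, while the TBB backend reports `false`, which is why `getWaitInfo` drains the wait list on the enqueueing thread before any TBB task runs.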
