Skip to content
Merged
Show file tree
Hide file tree
Changes from 127 commits
Commits
Show all changes
134 commits
Select commit Hold shift + click to select a range
d4700e5
[NATIVECPU] faster enqueue for larger ranges
uwedolinsky Feb 27, 2025
b3f2215
[NATIVECPU] use size_t, reserve vector size
uwedolinsky Aug 28, 2024
780588c
[NATIVECPU] added threadpool file to CMakeList
uwedolinsky Oct 22, 2024
db924f0
[NATIVECPU] Simple TBB backend
uwedolinsky Oct 22, 2024
1509655
[NATIVECPU] more shared code
uwedolinsky Oct 23, 2024
45ee46c
[NATIVECPU] update oneTBB tag
uwedolinsky Oct 24, 2024
aa7dec8
[NATIVECPU] added required include not needed by Windows
uwedolinsky Oct 24, 2024
29d11f9
[NATIVECPU] added system headers first
uwedolinsky Oct 24, 2024
e202f8d
[NATIVECPU] cmake fix
uwedolinsky Oct 24, 2024
fe8d099
[NATIVECPU] removed GIT_SHALLOW
uwedolinsky Oct 25, 2024
c2a3f57
[NATIVECPU] turn CMAKE_INCLUDE_CURRENT_DIR off for tbb
uwedolinsky Nov 1, 2024
be5b134
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
b18401f
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
4bff038
[NATIVECPU] remove potentially unneeded cmake
uwedolinsky Nov 1, 2024
eacf522
[NATIVECPU] oneTBB disabled by default
uwedolinsky Nov 4, 2024
c2996eb
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
91a6a49
[NATIVECPU] improved comment
uwedolinsky Nov 4, 2024
c1745c7
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
488504c
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
53013d4
[NATIVECPU] num_threads with oneTBB
uwedolinsky Nov 4, 2024
e8d8ff4
[NATIVECPU] added comment to cmake
uwedolinsky Nov 6, 2024
99c76c9
[NATIVECPU] using old task ids with tbb (WIP)
uwedolinsky Nov 6, 2024
9b40081
[NATIVECPU] fixed merge from main
uwedolinsky Nov 13, 2024
07c178d
[NATIVECPU] fix merge with events update
uwedolinsky Nov 14, 2024
aee938a
[NATIVECPU] revert noise
uwedolinsky Nov 14, 2024
59d731a
[NATIVECPU] fix integer size warnings
uwedolinsky Nov 14, 2024
e0341ef
[NATIVECPU] update oneTBB tag
uwedolinsky Nov 26, 2024
e719ec0
[NATIVECPU] use oneTBB UXL github
uwedolinsky Nov 29, 2024
81c3c82
[NATIVECPU] undefine _DEBUG in release builds for tbb
uwedolinsky Dec 12, 2024
ecaf51b
[NATIVECPU] oneTBB bump
uwedolinsky Jan 27, 2025
f5d6547
[NATIVECPU] clang-format and removed one inline
uwedolinsky Jan 28, 2025
e975e77
[NATIVECPU] clang-format
uwedolinsky Jan 28, 2025
26a5bd0
[NATIVECPU] removed inline
uwedolinsky Jan 28, 2025
38a91f7
[NATIVECPU] renamed wait to wait_all
uwedolinsky Jan 28, 2025
b31bd44
[NATIVECPU] move
uwedolinsky Feb 3, 2025
960b1d5
[NATIVECPU] removed unused groups
uwedolinsky Feb 28, 2025
04bd48a
[NATIVECPU] added async memcpy
uwedolinsky Mar 27, 2025
45c76d9
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Mar 27, 2025
7985e95
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Apr 10, 2025
7008b8b
[NATIVECPU] added non-blocking invoker, removed unused variable
uwedolinsky Apr 11, 2025
a3f4ea0
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Apr 11, 2025
2f1b3fe
[NATIVECPU] waiting for events in threads
uwedolinsky Apr 16, 2025
d5aa0cf
[NATIVECPU] resolved merge
uwedolinsky Apr 16, 2025
8efb1e4
[NATIVECPU] ndrange enqueue with less work for main thread
uwedolinsky Apr 22, 2025
67e9995
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Apr 22, 2025
2c52186
[NATIVECPU] static_assert for pointer type
uwedolinsky Apr 22, 2025
57bff8e
[NATIVECPU] resolved merge
uwedolinsky Apr 22, 2025
5348490
[NATIVECPU] added anonymous namespace
uwedolinsky Apr 22, 2025
1de1251
[NATIVECPU] separated out Invokers for enqueues
uwedolinsky Apr 22, 2025
9173f5e
[NATIVECPU] made more memops async
uwedolinsky Apr 23, 2025
7cd7caa
[NATIVECPU] memop pointer check outside worker lambda
uwedolinsky Apr 23, 2025
849ba98
Merge remote-tracking branch 'origin/sycl' into uwe/nativecpu_eventswait
uwedolinsky Apr 23, 2025
32ecf09
[NATIVECPU] moved inEvents
uwedolinsky Apr 23, 2025
c77454e
[NATIVECPU] fixed merge
uwedolinsky Apr 23, 2025
6142549
Merge remote-tracking branch 'origin/uwe/nativecpu_eventswait' into u…
uwedolinsky Apr 24, 2025
4b05062
[NATIVECPU] use unique_ptr for WaitInfo
uwedolinsky Apr 28, 2025
2722cad
[NATIVECPU] async memcopy
uwedolinsky Apr 28, 2025
24a0da3
[NATIVECPU] fixed merge
uwedolinsky Apr 28, 2025
22898b4
[NATIVECPU] code reuse for memcopies
uwedolinsky Apr 28, 2025
bed18b6
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Apr 28, 2025
5d12b7a
[NATIVECPU] removed invoker
uwedolinsky Apr 29, 2025
400ba0d
[NATIVECPU] removed unneeded function
uwedolinsky Apr 29, 2025
40f7270
[NATIVECPU] async wait in noop copy
uwedolinsky Apr 29, 2025
bd161bc
[NATIVECPU] async membuffer ops
uwedolinsky Apr 29, 2025
870754a
[NATIVECPU] quick fix for in-order queues
uwedolinsky Apr 30, 2025
e11f596
[NATIVECPU] construct state inside thread
uwedolinsky Apr 30, 2025
b4069d1
[NATIVECPU] update comments
uwedolinsky May 1, 2025
e83715c
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky May 1, 2025
ee2d232
Merge remote-tracking branch 'origin/sycl' into uwe/onetbb_integratio…
uwedolinsky May 2, 2025
dfc67d8
[NATIVECPU] removed nullptr check for pHEventWaitList
uwedolinsky May 2, 2025
a25b2c7
[NATIVECPU] updated oneTBB tag
uwedolinsky May 2, 2025
3074b16
[NATIVECPU] removed unneeded mutable
uwedolinsky May 6, 2025
070f0cf
[NATIVECPU] moved lambda code from enqueueMemBufferReadWriteRect_impl…
uwedolinsky May 6, 2025
eb64e5d
[NATIVECPU] resolved merge with events
uwedolinsky May 6, 2025
3207ffa
[NATIVECPU] simplified event generation
uwedolinsky May 7, 2025
106a31f
[MNATIVECPU] fixed merge with async branch
uwedolinsky May 7, 2025
6e1f722
[NATIVECPU] added interface to disable waiting in threads (for oneTBB)
uwedolinsky May 7, 2025
2a557f9
[NATIVECPU] removed the now unneeded std::function wrapper for oneTBB
uwedolinsky May 7, 2025
29c201c
[NATIVECPU] revert accidental filemode change
uwedolinsky May 7, 2025
941932b
[NATIVECPU] replaced function pointer template parameter
uwedolinsky May 9, 2025
4c5700d
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky May 12, 2025
1532779
[NATIVECPU] simplified WaitInfo
uwedolinsky May 13, 2025
0204d11
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky May 21, 2025
ffe66d0
[NATIVECPU] added mutex to backend queue
uwedolinsky May 26, 2025
3505c76
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky May 28, 2025
c95ebe7
[NATIVECPU] renamed flag to lock mutex
uwedolinsky Jun 2, 2025
67d77da
Merge remote-tracking branch 'origin/sycl' into uwe/nativecpu_queuemutex
uwedolinsky Jun 2, 2025
73cf574
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Jun 4, 2025
6fcea0f
[NATIVECPU] launch ranges with number of work items that is multiple …
uwedolinsky Jun 4, 2025
788cf69
[NATIVECPU] merge with events branch
uwedolinsky Jun 5, 2025
d86f429
[NATIVECPU] used lock_guard
uwedolinsky Jun 5, 2025
ddb908f
[NATIVECPU] removed unused local
uwedolinsky Jun 5, 2025
22ab082
[NATIVECPU] fixed merge with uwe/nativecpu_queuemutex
uwedolinsky Jun 9, 2025
8b20c39
Merge remote-tracking branch 'origin/uwe/fasternativecpuenqueue_async…
uwedolinsky Jun 9, 2025
c57b68a
Merge remote-tracking branch 'origin/uwe/nativecpu_queuemutex' into u…
uwedolinsky Jun 9, 2025
1d62903
[NATIVECPU] removed reference captures in enqueue lambdas
uwedolinsky Jun 9, 2025
aced1a4
Merge remote-tracking branch 'origin/uwe/fasternativecpuenqueue_async…
uwedolinsky Jun 9, 2025
11ebe05
[NATIVECPU] bump oneTBB version
uwedolinsky Jun 11, 2025
666f2ae
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Jun 12, 2025
37ccfca
[NATIVECPU] added option to turn off waiting in threads for oneTBB
uwedolinsky Jun 12, 2025
50e0720
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Jul 3, 2025
5fcea55
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Jul 21, 2025
4ad9ee7
[NATIEVCPU] resolved merge
uwedolinsky Jul 23, 2025
2fd6b37
[NATIVECPU] resolved merge with sycl branch
uwedolinsky Jul 24, 2025
5e0b99d
[NATIVECPU] removed unneeded capture
uwedolinsky Jul 24, 2025
f05bba1
[NATIVECPU] removed mutable from task lambda
uwedolinsky Jul 24, 2025
8548f6a
[NATIVECPU] merge with uwe/fasternativecpuenqueue_async_ops_eventswait
uwedolinsky Jul 24, 2025
58ffb89
[NATIVECPU] clang-format
uwedolinsky Jul 24, 2025
aecf330
[NATIVECPU] merge with uwe/fasternativecpuenqueue_async_ops_eventswait
uwedolinsky Jul 24, 2025
f6b68dc
[NATIVECPU] clang-format
uwedolinsky Jul 24, 2025
faa03d2
[SYCL][NATIVECPU] update docs for oneTBB integration
uwedolinsky Nov 5, 2024
cfcc325
[SYCL][NATIVECPU] fixed heading for oneTBB integration
uwedolinsky Nov 5, 2024
159db63
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Jul 28, 2025
a26eb58
[NATIVECPU] removed unused code
uwedolinsky Jul 28, 2025
271cf93
[NATIVECPU] revert to size_t
uwedolinsky Jul 28, 2025
5784a93
[NATIVECPU] remove inline
uwedolinsky Jul 28, 2025
49942d2
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Jul 29, 2025
5bcb27c
Merge remote-tracking branch 'origin/uwe/fasternativecpuenqueue_async…
uwedolinsky Aug 5, 2025
6a6f19f
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Aug 6, 2025
ecf52a5
Merge remote-tracking branch 'origin/uwe/fasternativecpuenqueue_async…
uwedolinsky Aug 7, 2025
fc9b330
[NATIVECPU] resolved merge with sycl
uwedolinsky Aug 18, 2025
0e0b454
[NATIVECPU] remove comment
uwedolinsky Aug 18, 2025
a009bd2
[NATIVECPU] removed unused function
uwedolinsky Aug 18, 2025
02450dc
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Aug 19, 2025
a44fc99
[NATIVECPU] update oneTBB
uwedolinsky Aug 19, 2025
6a5f9d1
[NATIVECPU] add -Wno-stringop-overflow for oneTBB
uwedolinsky Aug 19, 2025
87f3e17
[NATIVECPU] add -Wno-unknown-warning-option for oneTBB
uwedolinsky Aug 19, 2025
f856e32
[NATIVECPU] comment added to getWaitInfo
uwedolinsky Aug 22, 2025
4251ba8
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Aug 25, 2025
e330c0d
[NATIVECPU] renamed futures/tasksinfo function for consistency
uwedolinsky Aug 25, 2025
4604d13
[NATIVECPU] renamed futures to tasksinfo for consistency
uwedolinsky Aug 26, 2025
320e55f
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Aug 26, 2025
3ecb6ab
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Aug 27, 2025
7f8bb75
Merge remote-tracking branch 'origin/sycl' into uwe/fasternativecpuen…
uwedolinsky Aug 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions sycl/doc/design/SYCLNativeCPU.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,20 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia

The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`.

### oneTBB integration

SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB with SYCL Native CPU is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time:

```
python3 buildbot/configure.py \
--native_cpu \
--cmake-opt=-DNATIVECPU_WITH_ONETBB=On
```

This pulls oneTBB into the SYCL Native CPU build via CMake `FetchContent`; DPC++ can then be built as usual.

By default SYCL Native CPU implements its own scheduler whose only dependency is standard C++.

# Supported features and current limitations

The SYCL Native CPU flow is still a work in progress: it is not optimized, and several core SYCL features are currently unsupported. `barriers` are supported only when the oneAPI Construction Kit integration is enabled. Several math builtins are not supported, and attempting to use them will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu).
Expand Down
49 changes: 49 additions & 0 deletions unified-runtime/source/adapters/native_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
Expand All @@ -51,6 +52,34 @@ set_target_properties(${TARGET_NAME} PROPERTIES
SOVERSION "${PROJECT_VERSION_MAJOR}"
)

# oneTBB is used as an optional NativeCPU backend and disabled by default.
option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
if(NATIVECPU_WITH_ONETBB)
  message(STATUS "Configuring Native CPU adapter with oneTBB backend.")

  include(FetchContent)
  # FetchContent disables the configure/build steps of the underlying
  # ExternalProject machinery, so configure-step options such as CMAKE_ARGS
  # are ignored here. oneTBB is configured through the TBB_* cache variables
  # set below instead.
  FetchContent_Declare(
    tbb
    GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
    # commit 4e4fffed4fb86ae0960a3364700f549b539c777e (HEAD -> master, origin/master, origin/HEAD)
    # Author: Ilya Isaev <[email protected]>
    # Date: Mon Aug 18 10:35:26 2025 +0200
    # Improve task_arena interoperability with task_groups (#1784)
    GIT_TAG 4e4fffed4fb86ae0960a3364700f549b539c777e
    OVERRIDE_FIND_PACKAGE
  )
  # Build only the oneTBB library itself: no tests, examples or benchmarks.
  set(TBB_TEST OFF CACHE INTERNAL "" FORCE)
  set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE)
  set(TBB_BENCH OFF CACHE INTERNAL "" FORCE)
  set(TBB_BUILD ON CACHE INTERNAL "" FORCE)
  set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
  set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
  set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
  # Turn CMAKE_INCLUDE_CURRENT_DIR off before oneTBB's directories are added.
  set(CMAKE_INCLUDE_CURRENT_DIR OFF)
  FetchContent_MakeAvailable(tbb)
endif()

find_package(Threads REQUIRED)

target_link_libraries(${TARGET_NAME} PRIVATE
Expand All @@ -63,3 +92,23 @@ target_link_libraries(${TARGET_NAME} PRIVATE
target_include_directories(${TARGET_NAME} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/../../"
)

# Link oneTBB into the adapter and adjust the oneTBB targets' compile flags
# when the oneTBB backend is enabled.
if(NATIVECPU_WITH_ONETBB)
target_link_libraries(${TARGET_NAME} PRIVATE
TBB::tbb
)
if (NOT MSVC)
# oneTBB currently casts away some const qualifiers, so the corresponding
# warnings are disabled on the oneTBB targets themselves.
# TODO: check if the compiler actually supports these options.
target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow -Wno-unknown-warning-option)
target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
endif()

# Undefine the _DEBUG option in non-debug builds so that the release
# tbbbind is found.
if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
target_compile_options(tbb PRIVATE -U_DEBUG)
endif()

# The adapter sources select the oneTBB backend with
# #ifdef NATIVECPU_WITH_ONETBB.
target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
endif()
84 changes: 45 additions & 39 deletions unified-runtime/source/adapters/native_cpu/enqueue.cpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,14 @@ class WaitInfo {
}
};

template <class T>
inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList) {
const ur_event_handle_t *phEventWaitList,
const T &scheduler) {
if (numEventsInWaitList && !scheduler.CanWaitInThread()) {
urEventWait(numEventsInWaitList, phEventWaitList);
numEventsInWaitList = 0;
}
return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList);
}

Expand Down Expand Up @@ -151,7 +157,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

auto &tp = hQueue->getDevice()->tp;
const size_t numParallelThreads = tp.num_threads();
std::vector<std::future<void>> futures;
auto Tasks = native_cpu::getScheduler(tp);
auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
Expand All @@ -162,7 +168,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
kernel->updateMemPool(numParallelThreads);

auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
auto InEvents =
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);

const size_t numWG = numWG0 * numWG1 * numWG2;
const size_t numWGPerThread = numWG / numParallelThreads;
Expand All @@ -177,42 +184,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
rangeEnd[0] = rangeEnd[3] % numWG0;
rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
futures.emplace_back(tp.schedule_task(
[ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3],
numWG0, numWG1, numParallelThreads](size_t threadId) {
auto state = getState(ndr);
InEvents.wait();
for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
g2 = rangeStart[2], g3 = rangeStart[3];
g3 < rangeEnd; ++g3) {
Tasks.schedule([ndr, InEvents, &kernel = *kernel, rangeStart,
rangeEnd = rangeEnd[3], numWG0, numWG1,
numParallelThreads](size_t threadId) {
auto state = getState(ndr);
InEvents.wait();
for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2],
g3 = rangeStart[3];
g3 < rangeEnd; ++g3) {
#ifdef NATIVECPU_USE_OCK
state.update(g0, g1, g2);
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(), &state);
state.update(g0, g1, g2);
kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
&state);
#else
for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
state.update(g0, g1, g2, local0, local1, local2);
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(),
&state);
}
}
for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
state.update(g0, g1, g2, local0, local1, local2);
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(), &state);
}
}
}
#endif
if (++g0 == numWG0) {
g0 = 0;
if (++g1 == numWG1) {
g1 = 0;
++g2;
}
}
if (++g0 == numWG0) {
g0 = 0;
if (++g1 == numWG1) {
g1 = 0;
++g2;
}
}));
}
}
});
rangeStart = rangeEnd;
}
event->set_futures(futures);
event->set_futures(Tasks.getTaskInfo());

if (phEvent) {
*phEvent = event;
Expand Down Expand Up @@ -248,14 +254,14 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
return result;
}
auto &tp = hQueue->getDevice()->tp;
std::vector<std::future<void>> futures;
auto Tasks = native_cpu::getScheduler(tp);
auto InEvents =
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
futures.emplace_back(tp.schedule_task([f, InEvents](size_t) {
native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
Tasks.schedule([f, InEvents](size_t) {
InEvents.wait();
f();
}));
event->set_futures(futures);
});
event->set_futures(Tasks.getTaskInfo());
event->set_callback(
[event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); });
return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -466,7 +472,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
// TODO: error checking
// TODO: handle async
void *startingPtr = hBuffer->_mem + offset;
unsigned steps = size / patternSize;
size_t steps = size / patternSize;
for (unsigned i = 0; i < steps; i++) {
memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
patternSize);
Expand Down Expand Up @@ -576,7 +582,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
break;
}
default: {
for (unsigned int step{0}; step < size; step += patternSize) {
for (size_t step{0}; step < size; step += patternSize) {
auto *dest = reinterpret_cast<void *>(
reinterpret_cast<uint8_t *>(ptr) + step);
memcpy(dest, pPattern, patternSize);
Expand Down
7 changes: 3 additions & 4 deletions unified-runtime/source/adapters/native_cpu/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "ur_api.h"

#include "common.hpp"
#include "device.hpp"
#include "event.hpp"
#include "queue.hpp"
#include <cstdint>
Expand Down Expand Up @@ -111,7 +112,7 @@ urEnqueueTimestampRecordingExp(ur_queue_handle_t /*hQueue*/, bool /*blocking*/,
ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue,
ur_command_t command_type)
: queue(queue), context(queue->getContext()), command_type(command_type),
done(false) {
done(false), futures(queue->getDevice()->tp) {
this->queue->addEvent(this);
}

Expand All @@ -126,9 +127,7 @@ void ur_event_handle_t_::wait() {
if (done) {
return;
}
for (auto &f : futures) {
f.wait();
}
this->futures.wait_all();
queue->removeEvent(this);
done = true;
// The callback may need to acquire the lock, so we unlock it here
Expand Down
5 changes: 3 additions & 2 deletions unified-runtime/source/adapters/native_cpu/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//
#pragma once
#include "common.hpp"
#include "threadpool.hpp"
#include "ur_api.h"
#include <cstdint>
#include <future>
Expand Down Expand Up @@ -42,7 +43,7 @@ struct ur_event_handle_t_ : RefCounted {

ur_command_t getCommandType() const { return command_type; }

void set_futures(std::vector<std::future<void>> &fs) {
void set_futures(native_cpu::tasksinfo_t &&fs) {
std::lock_guard<std::mutex> lock(mutex);
futures = std::move(fs);
}
Expand All @@ -61,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted {
ur_command_t command_type;
bool done;
std::mutex mutex;
std::vector<std::future<void>> futures;
native_cpu::tasksinfo_t futures;
std::packaged_task<void()> callback;
uint64_t timestamp_start = 0;
uint64_t timestamp_end = 0;
Expand Down
85 changes: 84 additions & 1 deletion unified-runtime/source/adapters/native_cpu/threadpool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,90 @@ template <typename ThreadPoolT> class threadpool_interface {
return ret;
}
};
using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;

using threadpool_t = threadpool_interface<detail::simple_thread_pool>;
// Task bookkeeping for the default thread pool backend: stores the
// std::future of every scheduled task so they can all be waited on later.
class TasksInfo_TP {
  using Pending = std::future<void>;
  std::vector<Pending> futures;

public:
  // The thread pool argument is not used; the constructor only accepts it so
  // that this type can be constructed the same way as other task-info types.
  TasksInfo_TP(simple_threadpool_t &) {}

  // Takes ownership of the future of a task that has just been scheduled.
  void schedule(Pending &&f) { futures.push_back(std::move(f)); }

  // Waits on every stored future, in the order they were recorded.
  void wait_all() {
    for (auto it = futures.begin(); it != futures.end(); ++it) {
      it->wait();
    }
  }
};

// State shared by every scheduler flavour: a reference to the backend (TP)
// that runs tasks and a TaskInfo object, constructed from that backend, that
// keeps track of what has been scheduled.
template <class TP, class TaskInfo> struct Scheduler_base {
  TP &ref;     // backend the tasks are submitted to
  TaskInfo ti; // bookkeeping for the submitted tasks

  Scheduler_base(TP &pool) : ref(pool), ti(pool) {}

  // Hands the bookkeeping object over to the caller by moving it out of this
  // scheduler; `ti` is left in a moved-from state afterwards.
  TaskInfo getTaskInfo() {
    TaskInfo released(std::move(ti));
    return released;
  }

  // Default policy: scheduled tasks may wait inside the worker thread.
  // Derived schedulers can shadow this with a different answer.
  static constexpr bool CanWaitInThread() { return true; }
};

// Scheduler for the default thread pool backend. Task bookkeeping is done
// with TasksInfo_TP.
template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> {
  using Scheduler_base<TP, TasksInfo_TP>::Scheduler_base;

  // Submits `task` to the pool through schedule_task() and records the
  // handle it returns in the task-info object.
  template <class T> void schedule(T &&task) {
    auto pending = this->ref.schedule_task(std::forward<T>(task));
    this->ti.schedule(std::move(pending));
  }
};

template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) {
return Scheduler<TPType>(tp);
}

} // namespace native_cpu

#ifdef NATIVECPU_WITH_ONETBB
// Simple TBB backend
#include "oneapi/tbb.h"
namespace native_cpu {

// Thread pool facade for the oneTBB backend; all work goes through a single
// oneTBB task_group owned by this object.
class TBB_threadpool {
  oneapi::tbb::task_group tasks;

public:
  // Reports oneTBB's default concurrency as the number of threads.
  size_t num_threads() const noexcept {
    const auto concurrency = oneapi::tbb::info::default_concurrency();
    return concurrency;
  }

  // Access to the underlying task_group, used to submit work.
  oneapi::tbb::task_group &Tasks() { return tasks; }

  // Calls wait() on the task_group.
  void wait_all() { tasks.wait(); }
};

// Task bookkeeping for the oneTBB backend. No per-task state is kept: it
// only remembers the pool, and waiting is delegated to the pool itself.
class TBB_TasksInfo {
  TBB_threadpool *tp;

public:
  // Stores the address of the pool; the pool is not owned by this object.
  TBB_TasksInfo(TBB_threadpool &pool) : tp(&pool) {}

  // Forwards to TBB_threadpool::wait_all().
  void wait_all() { tp->wait_all(); }
};

// Scheduler specialization for the oneTBB backend. Tasks are submitted to
// the pool's task_group, and bookkeeping is done with TBB_TasksInfo.
template <>
struct Scheduler<TBB_threadpool>
    : Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
  using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;

  // Runs `task_` in the task_group, passing it the index of the oneTBB
  // thread that executes it as its thread id argument.
  template <class T> void schedule(T &&task_) {
    // task_ is a forwarding reference: forward it so that an lvalue callable
    // is copied into the closure rather than silently moved-from, while
    // temporaries are still moved.
    ref.Tasks().run([task = std::forward<T>(task_)]() {
      auto thread_id = oneapi::tbb::this_task_arena::current_thread_index();
      // The id is used by tasks as a per-thread index, so it must lie in
      // [0, default_concurrency).
      assert(thread_id >= 0 &&
             thread_id < oneapi::tbb::info::default_concurrency());
      task(thread_id);
    });
  }

  // With oneTBB, tasks do not wait on events inside the worker thread;
  // callers that check this flag wait before scheduling instead.
  static constexpr bool CanWaitInThread() { return false; }
};

// Backend selection when NATIVECPU_WITH_ONETBB is defined: the oneTBB-backed
// types become the adapter's task-info and thread pool types.
using tasksinfo_t = TBB_TasksInfo;
using threadpool_t = TBB_threadpool;
} // namespace native_cpu

#else
// The default backend, selected when NATIVECPU_WITH_ONETBB is not defined:
// the simple thread pool and its future-based task bookkeeping.
namespace native_cpu {
using tasksinfo_t = TasksInfo_TP;
using threadpool_t = simple_threadpool_t;
} // namespace native_cpu
#endif