Commit 4369958

Merge branch 'sycl' into msan-always-reserve-origin-mem

2 parents: e6f9868 + 2ae009e

File tree: 6 files changed, +203 −47 lines

sycl/doc/design/SYCLNativeCPU.md

Lines changed: 14 additions & 0 deletions
```diff
@@ -62,6 +62,20 @@ in order to use a local checkout of the oneAPI Construction Kit. The CMake varia
 
 The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`.
 
+### oneTBB integration
+
+SYCL Native CPU can use oneTBB as an optional backend for task scheduling. oneTBB support is enabled by setting `NATIVECPU_WITH_ONETBB=On` at configure time:
+
+```
+python3 buildbot/configure.py \
+  --native_cpu \
+  --cmake-opt=-DNATIVECPU_WITH_ONETBB=On
+```
+
+This pulls oneTBB into SYCL Native CPU via CMake `FetchContent`; DPC++ can then be built as usual.
+
+By default, SYCL Native CPU uses its own scheduler, whose only dependency is standard C++.
+
 # Supported features and current limitations
 
 The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu).
```
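As a quick smoke test for the runtime selection described above, a minimal SYCL program can be run under the selector. This is an illustrative sketch, not part of the commit; the kernel and the exit-code check are made up:

```cpp
// Build with a Native CPU-enabled DPC++ (clang++ -fsycl ...), then run:
//   ONEAPI_DEVICE_SELECTOR=native_cpu:cpu ./a.out
#include <sycl/sycl.hpp>

#include <iostream>

int main() {
  sycl::queue q; // the selector restricts device discovery to Native CPU
  std::cout << q.get_device().get_info<sycl::info::device::name>() << "\n";

  int data[64] = {};
  {
    sycl::buffer<int, 1> buf{data, sycl::range<1>{64}};
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor acc{buf, cgh, sycl::write_only};
      cgh.parallel_for(sycl::range<1>{64}, [=](sycl::id<1> i) {
        acc[i] = static_cast<int>(i[0]);
      });
    });
  } // buffer destruction waits for the kernel and copies back into data
  return data[63] == 63 ? 0 : 1;
}
```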

unified-runtime/source/adapters/native_cpu/CMakeLists.txt

Lines changed: 49 additions & 0 deletions
```diff
@@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME}
   ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
   ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
@@ -51,6 +52,34 @@ set_target_properties(${TARGET_NAME} PROPERTIES
   SOVERSION "${PROJECT_VERSION_MAJOR}"
 )
 
+# oneTBB is used as an optional NativeCPU backend and disabled by default.
+option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
+if(NATIVECPU_WITH_ONETBB)
+  message(STATUS "Configuring Native CPU adapter with oneTBB backend.")
+
+  include(FetchContent)
+  FetchContent_Declare(
+    tbb
+    GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
+    # commit 4e4fffed4fb86ae0960a3364700f549b539c777e (HEAD -> master, origin/master, origin/HEAD)
+    # Author: Ilya Isaev <[email protected]>
+    # Date: Mon Aug 18 10:35:26 2025 +0200
+    # Improve task_arena interoperability with task_groups (#1784)
+    GIT_TAG 4e4fffed4fb86ae0960a3364700f549b539c777e
+    CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
+    OVERRIDE_FIND_PACKAGE
+  )
+  set(TBB_TEST OFF CACHE INTERNAL "" FORCE)
+  set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE)
+  set(TBB_BENCH OFF CACHE INTERNAL "" FORCE)
+  set(TBB_BUILD ON CACHE INTERNAL "" FORCE)
+  set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
+  set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
+  set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
+  set(CMAKE_INCLUDE_CURRENT_DIR OFF)
+  FetchContent_MakeAvailable(tbb)
+endif()
+
 find_package(Threads REQUIRED)
 
 target_link_libraries(${TARGET_NAME} PRIVATE
@@ -63,3 +92,23 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 target_include_directories(${TARGET_NAME} PRIVATE
   "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
+
+if(NATIVECPU_WITH_ONETBB)
+  target_link_libraries(${TARGET_NAME} PRIVATE
+    TBB::tbb
+  )
+  if(NOT MSVC)
+    # oneTBB currently casts away some const qualifiers
+    # todo: check if compiler actually supports these options
+    target_compile_options(tbb PRIVATE -Wno-cast-qual -Wno-stringop-overflow -Wno-unknown-warning-option)
+    target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
+  endif()
+
+  # Undefine the _DEBUG option in release builds to find the
+  # release tbbbind
+  if(NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+    target_compile_options(tbb PRIVATE -U_DEBUG)
+  endif()
+
+  target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
+endif()
```
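Since the hunk above both pins the oneTBB revision and disables its tests through cache overrides, a standalone program against the two entry points the adapter actually uses (`task_group` and `info::default_concurrency`) is a convenient way to sanity-check the fetched build. A sketch, not part of the commit:

```cpp
#include <oneapi/tbb.h>

#include <cassert>
#include <cstdio>

int main() {
  // The adapter sizes its thread count from this query (see threadpool.hpp).
  const int hw = oneapi::tbb::info::default_concurrency();
  assert(hw >= 1);

  // task_group is the primitive the new TBB_threadpool wraps.
  oneapi::tbb::task_group tg;
  int value = 0;
  tg.run([&] { value = 42; }); // a single task, so no race on value
  tg.wait();                   // mirrors TBB_threadpool::wait_all()
  std::printf("concurrency=%d value=%d\n", hw, value);
  return value == 42 ? 0 : 1;
}
```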

unified-runtime/source/adapters/native_cpu/enqueue.cpp

File mode changed: 100644 → 100755
Lines changed: 49 additions & 39 deletions
```diff
@@ -70,8 +70,18 @@ class WaitInfo {
   }
 };
 
+template <class T>
 inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList,
-                                   const ur_event_handle_t *phEventWaitList) {
+                                   const ur_event_handle_t *phEventWaitList,
+                                   const T &scheduler) {
+  if (numEventsInWaitList && !scheduler.CanWaitInThread()) {
+    // Waiting for dependent events in threads launched by the enqueue may
+    // not work correctly for some backend/schedulers, so we have the safe
+    // option here to wait in the main thread instead (potentially at the
+    // expense of performance).
+    urEventWait(numEventsInWaitList, phEventWaitList);
+    numEventsInWaitList = 0;
+  }
   return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList);
 }
 
@@ -151,7 +161,7 @@
 
   auto &tp = hQueue->getDevice()->tp;
   const size_t numParallelThreads = tp.num_threads();
-  std::vector<std::future<void>> futures;
+  auto Tasks = native_cpu::getScheduler(tp);
   auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
   auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
   auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
@@ -162,7 +172,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
   kernel->updateMemPool(numParallelThreads);
 
-  auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
+  auto InEvents =
+      native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
 
   const size_t numWG = numWG0 * numWG1 * numWG2;
   const size_t numWGPerThread = numWG / numParallelThreads;
@@ -177,42 +188,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     rangeEnd[0] = rangeEnd[3] % numWG0;
     rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
     rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
-    futures.emplace_back(tp.schedule_task(
-        [ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3],
-         numWG0, numWG1, numParallelThreads](size_t threadId) {
-          auto state = getState(ndr);
-          InEvents.wait();
-          for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
-                      g2 = rangeStart[2], g3 = rangeStart[3];
-               g3 < rangeEnd; ++g3) {
+    Tasks.schedule([ndr, InEvents, &kernel = *kernel, rangeStart,
+                    rangeEnd = rangeEnd[3], numWG0, numWG1,
+                    numParallelThreads](size_t threadId) {
+      auto state = getState(ndr);
+      InEvents.wait();
+      for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2],
+                  g3 = rangeStart[3];
+           g3 < rangeEnd; ++g3) {
 #ifdef NATIVECPU_USE_OCK
-            state.update(g0, g1, g2);
-            kernel._subhandler(
-                kernel.getArgs(numParallelThreads, threadId).data(), &state);
+        state.update(g0, g1, g2);
+        kernel._subhandler(kernel.getArgs(numParallelThreads, threadId).data(),
+                           &state);
 #else
-            for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
-              for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
-                for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
-                  state.update(g0, g1, g2, local0, local1, local2);
-                  kernel._subhandler(
-                      kernel.getArgs(numParallelThreads, threadId).data(),
-                      &state);
-                }
-              }
+        for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
+          for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
+            for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
+              state.update(g0, g1, g2, local0, local1, local2);
+              kernel._subhandler(
+                  kernel.getArgs(numParallelThreads, threadId).data(), &state);
             }
+          }
+        }
 #endif
-            if (++g0 == numWG0) {
-              g0 = 0;
-              if (++g1 == numWG1) {
-                g1 = 0;
-                ++g2;
-              }
-            }
+        if (++g0 == numWG0) {
+          g0 = 0;
+          if (++g1 == numWG1) {
+            g1 = 0;
+            ++g2;
           }
-        }));
+        }
+      }
+    });
     rangeStart = rangeEnd;
   }
-  event->set_futures(futures);
+  event->set_tasksinfo(Tasks.getMovedTaskInfo());
 
   if (phEvent) {
     *phEvent = event;
@@ -248,14 +258,14 @@ withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
     return result;
   }
   auto &tp = hQueue->getDevice()->tp;
-  std::vector<std::future<void>> futures;
+  auto Tasks = native_cpu::getScheduler(tp);
   auto InEvents =
-      native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
-  futures.emplace_back(tp.schedule_task([f, InEvents](size_t) {
+      native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList, Tasks);
+  Tasks.schedule([f, InEvents](size_t) {
     InEvents.wait();
    f();
-  }));
-  event->set_futures(futures);
+  });
+  event->set_tasksinfo(Tasks.getMovedTaskInfo());
   event->set_callback(
       [event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); });
   return UR_RESULT_SUCCESS;
@@ -465,7 +475,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
   // TODO: error checking
   // TODO: handle async
   void *startingPtr = hBuffer->_mem + offset;
-  unsigned steps = size / patternSize;
+  size_t steps = size / patternSize;
   for (unsigned i = 0; i < steps; i++) {
     memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
            patternSize);
@@ -575,7 +585,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
       break;
     }
    default: {
-      for (unsigned int step{0}; step < size; step += patternSize) {
+      for (size_t step{0}; step < size; step += patternSize) {
        auto *dest = reinterpret_cast<void *>(
            reinterpret_cast<uint8_t *>(ptr) + step);
        memcpy(dest, pPattern, patternSize);
```
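The launch path above flattens the three-dimensional work-group range, hands each task a contiguous slice of linear ids, and steps the (g0, g1, g2) coordinates in lockstep with the linear counter g3. A self-contained sketch of that decomposition follows; the sizes and the remainder policy are illustrative, while the modulo/division and the carry-propagating increment match the commit:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  const size_t numWG0 = 4, numWG1 = 2, numWG2 = 3; // illustrative sizes
  const size_t numWG = numWG0 * numWG1 * numWG2;
  const size_t numThreads = 4;
  const size_t perThread = numWG / numThreads;
  for (size_t t = 0; t < numThreads; ++t) {
    const size_t begin = t * perThread;
    // give any remainder to the last slice (one simple policy)
    const size_t end = (t + 1 == numThreads) ? numWG : begin + perThread;
    // recover 3D coordinates from the linear id, as rangeEnd[0..2] does
    size_t g0 = begin % numWG0;
    size_t g1 = (begin / numWG0) % numWG1;
    size_t g2 = begin / (numWG0 * numWG1);
    for (size_t g3 = begin; g3 < end; ++g3) {
      std::printf("task %zu -> WG(%zu, %zu, %zu)\n", t, g0, g1, g2);
      // carry-propagating increment, identical to the task body above
      if (++g0 == numWG0) {
        g0 = 0;
        if (++g1 == numWG1) {
          g1 = 0;
          ++g2;
        }
      }
    }
  }
  return 0;
}
```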

unified-runtime/source/adapters/native_cpu/event.cpp

Lines changed: 3 additions & 4 deletions
```diff
@@ -11,6 +11,7 @@
 #include "ur_api.h"
 
 #include "common.hpp"
+#include "device.hpp"
 #include "event.hpp"
 #include "queue.hpp"
 #include <cstdint>
@@ -111,7 +112,7 @@ urEnqueueTimestampRecordingExp(ur_queue_handle_t /*hQueue*/, bool /*blocking*/,
 ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue,
                                        ur_command_t command_type)
     : queue(queue), context(queue->getContext()), command_type(command_type),
-      done(false) {
+      done(false), tasksinfo(queue->getDevice()->tp) {
   this->queue->addEvent(this);
 }
 
@@ -126,9 +127,7 @@ void ur_event_handle_t_::wait() {
   if (done) {
     return;
   }
-  for (auto &f : futures) {
-    f.wait();
-  }
+  this->tasksinfo.wait_all();
   queue->removeEvent(this);
   done = true;
   // The callback may need to acquire the lock, so we unlock it here
```

unified-runtime/source/adapters/native_cpu/event.hpp

Lines changed: 4 additions & 3 deletions
```diff
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 #include "common.hpp"
+#include "threadpool.hpp"
 #include "ur_api.h"
 #include <cstdint>
 #include <future>
@@ -42,9 +43,9 @@ struct ur_event_handle_t_ : RefCounted {
 
   ur_command_t getCommandType() const { return command_type; }
 
-  void set_futures(std::vector<std::future<void>> &fs) {
+  void set_tasksinfo(native_cpu::tasksinfo_t &&fs) {
     std::lock_guard<std::mutex> lock(mutex);
-    futures = std::move(fs);
+    tasksinfo = std::move(fs);
   }
 
   void tick_start();
@@ -61,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted {
   ur_command_t command_type;
   bool done;
   std::mutex mutex;
-  std::vector<std::future<void>> futures;
+  native_cpu::tasksinfo_t tasksinfo;
   std::packaged_task<void()> callback;
   uint64_t timestamp_start = 0;
   uint64_t timestamp_end = 0;
```

unified-runtime/source/adapters/native_cpu/threadpool.hpp

Lines changed: 84 additions & 1 deletion
```diff
@@ -207,7 +207,90 @@ template <typename ThreadPoolT> class threadpool_interface {
     return ret;
   }
 };
+using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
 
-using threadpool_t = threadpool_interface<detail::simple_thread_pool>;
+class TasksInfo_TP {
+  using FType = std::future<void>;
+  std::vector<FType> futures;
 
+public:
+  void schedule(FType &&f) { futures.emplace_back(std::move(f)); }
+  void wait_all() {
+    for (auto &f : futures)
+      f.wait();
+  }
+  TasksInfo_TP(simple_threadpool_t &) {}
+};
+
+template <class TP, class TaskInfo> struct Scheduler_base {
+  TP &ref;
+  TaskInfo ti;
+  Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {}
+  TaskInfo getMovedTaskInfo() { return std::move(ti); }
+  static constexpr bool CanWaitInThread() { return true; }
+};
+
+template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> {
+  using Scheduler_base<TP, TasksInfo_TP>::Scheduler_base;
+
+  template <class T> void schedule(T &&task) {
+    this->ti.schedule(this->ref.schedule_task(std::forward<T>(task)));
+  }
+};
+
+template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) {
+  return Scheduler<TPType>(tp);
+}
+
+} // namespace native_cpu
+
+#ifdef NATIVECPU_WITH_ONETBB
+// Simple TBB backend
+#include "oneapi/tbb.h"
+namespace native_cpu {
+
+class TBB_threadpool {
+  oneapi::tbb::task_group tasks;
+
+public:
+  void wait_all() { tasks.wait(); }
+  oneapi::tbb::task_group &Tasks() { return tasks; }
+  size_t num_threads() const noexcept {
+    return oneapi::tbb::info::default_concurrency();
+  }
+};
+
+class TBB_TasksInfo {
+  TBB_threadpool *tp;
+
+public:
+  void wait_all() { tp->wait_all(); }
+  TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
+};
+
+template <>
+struct Scheduler<TBB_threadpool>
+    : Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
+  using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;
+  template <class T> void schedule(T &&task_) {
+    ref.Tasks().run([task = std::move(task_)]() {
+      auto thread_id = tbb::this_task_arena::current_thread_index();
+      assert(thread_id >= 0 &&
+             thread_id < oneapi::tbb::info::default_concurrency());
+      task(thread_id);
+    });
+  }
+  static constexpr bool CanWaitInThread() { return false; }
+};
+
+using tasksinfo_t = TBB_TasksInfo;
+using threadpool_t = TBB_threadpool;
+} // namespace native_cpu
+
+#else
+// The default backend
+namespace native_cpu {
+using tasksinfo_t = TasksInfo_TP;
+using threadpool_t = simple_threadpool_t;
 } // namespace native_cpu
+#endif
```
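Both backends are driven through the same three-step surface: obtain a `Scheduler` for the pool, `schedule` the work, then move the resulting `tasksinfo_t` into the event so `wait()` can drain it later. A minimal usage sketch built from the names this commit adds, with the loop body as a stand-in for the kernel task:

```cpp
#include "threadpool.hpp" // native_cpu::threadpool_t, tasksinfo_t, getScheduler

#include <cstddef>

native_cpu::tasksinfo_t run_chunks(native_cpu::threadpool_t &tp) {
  // Resolves to the generic Scheduler or the TBB specialization,
  // depending on NATIVECPU_WITH_ONETBB.
  auto Tasks = native_cpu::getScheduler(tp);
  for (int chunk = 0; chunk < 4; ++chunk) {
    Tasks.schedule([chunk](size_t threadId) {
      // per-chunk work; threadId indexes per-thread state (cf. getArgs)
      (void)chunk;
      (void)threadId;
    });
  }
  // what urEnqueueKernelLaunch stores via event->set_tasksinfo(...)
  return Tasks.getMovedTaskInfo();
}

// Later, ur_event_handle_t_::wait() boils down to: tasksinfo.wait_all();
```

Note the `CanWaitInThread()` flag on each `Scheduler`: the in-house pool reports `true`, while the TBB backend reports `false`, which is why `getWaitInfo` drains the wait list on the enqueueing thread before any TBB task runs.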
