intel
diff --git a/‎.github/workflows/sycl-ur-perf-benchmarking.yml‎
Lines changed: 12 additions & 0 deletions b/‎.github/workflows/sycl-ur-perf-benchmarking.yml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎.github/workflows/ur-benchmarks-reusable.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/ur-benchmarks-reusable.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/ur-benchmarks.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/ur-benchmarks.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎sycl/doc/GetStartedGuide.md‎
Lines changed: 8 additions & 9 deletions b/‎sycl/doc/GetStartedGuide.md‎
Lines changed: 8 additions & 9 deletions
diff --git a/‎sycl/doc/design/SYCL2020-SpecializationConstants.md‎
Lines changed: 15 additions & 15 deletions b/‎sycl/doc/design/SYCL2020-SpecializationConstants.md‎
Lines changed: 15 additions & 15 deletions
diff --git a/‎sycl/include/sycl/detail/spinlock.hpp‎
Lines changed: 2 additions & 0 deletions b/‎sycl/include/sycl/detail/spinlock.hpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎sycl/source/detail/kernel_program_cache.hpp‎
Lines changed: 9 additions & 5 deletions b/‎sycl/source/detail/kernel_program_cache.hpp‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎unified-runtime/source/adapters/cuda/command_buffer.cpp‎
Lines changed: 2 additions & 2 deletions b/‎unified-runtime/source/adapters/cuda/command_buffer.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎unified-runtime/source/adapters/cuda/command_buffer.hpp‎
Lines changed: 0 additions & 17 deletions b/‎unified-runtime/source/adapters/cuda/command_buffer.hpp‎
Lines changed: 0 additions & 17 deletions
diff --git a/‎unified-runtime/source/adapters/cuda/common.cpp‎
Lines changed: 0 additions & 5 deletions b/‎unified-runtime/source/adapters/cuda/common.cpp‎
Lines changed: 0 additions & 5 deletions
@@ -0,0 +1,12 @@
+name: Benchmarks
+
+# This workflow is a WIP: this workflow file acts as a placeholder.
+
+on: [ workflow_dispatch ]
+
+jobs:
+  do-nothing:
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo 'This workflow is a WIP.'
+
@@ -1,6 +1,8 @@
 name: Benchmarks Reusable
 
 # This workflow is a WIP: This workflow file acts as a placeholder.
+#
+# This workflow is set to be merged into benchmark.yml
 
 on: [ workflow_call ]
 
 
@@ -1,6 +1,8 @@
 name: Benchmarks
 
 # This workflow is a WIP: this workflow file acts as a placeholder.
+#
+# This workflow is set to be merged into benchmark.yml
 
 on: [ workflow_dispatch ]
 
 
@@ -39,15 +39,14 @@ and a wide range of compute accelerators such as GPU and FPGA.
 
 ## Prerequisites
 
-* `git` - [Download](https://git-scm.com/downloads)
-* `cmake` version 3.20 or later - [Download](http://www.cmake.org/download/)
-* `python` - [Download](https://www.python.org/downloads/)
-* `ninja` -
-[Download](https://github.com/ninja-build/ninja/wiki/Pre-built-Ninja-packages)
-* `hwloc` version 2.3 or later (Linux only)
-  * libhwloc-dev or hwloc-devel package on linux
-* C++ compiler
-  * See LLVM's [host compiler toolchain requirements](https://github.com/intel/llvm/blob/sycl/llvm/docs/GettingStarted.rst#host-c-toolchain-both-compiler-and-standard-library)
+| Software                                                                    | Version                                                                                                                              |
+| ---                                                                         | ---                                                                                                                                  |
+| [Git](https://git-scm.com/downloads)                                        |                                                                                                                                      |
+| [CMake](http://www.cmake.org/download/)                                     | [See LLVM](https://github.com/intel/llvm/blob/sycl/llvm/docs/GettingStarted.rst#software)                                            |
+| [Python](https://www.python.org/downloads/)                                 | [See LLVM](https://github.com/intel/llvm/blob/sycl/llvm/docs/GettingStarted.rst#software)                                            |
+| [Ninja](https://github.com/ninja-build/ninja/wiki/Pre-built-Ninja-packages) |                                                                                                                                      |
+| `hwloc`                                                                     | >= 2.3 (Linux only, `libhwloc-dev` or `hwloc-devel`)                                                                                 |
+| C++ compiler                                                                | [See LLVM](https://github.com/intel/llvm/blob/sycl/llvm/docs/GettingStarted.rst#host-c-toolchain-both-compiler-and-standard-library) |
 
 Alternatively, you can create a Docker image that has everything you need for
 building pre-installed using the [Ubuntu 24.04 build Dockerfile](https://github.com/intel/llvm/blob/sycl/devops/containers/ubuntu2404_build.Dockerfile).
 
@@ -80,7 +80,7 @@ as:
 [sycl-2020-spec-constant-glossary]: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#specialization-constant
 [sycl-2020-glossary]: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#glossary
 
-And implementation is based on [SPIR-V speficiation][spirv-spec] support
+And implementation is based on [SPIR-V specification][spirv-spec] support
 for [Specialization][spirv-specialization]. However, the specification also
 states the following:
 
@@ -172,7 +172,7 @@ Based on those limitations, the following mapping design is proposed:
   ```
   namespace detail {
     // assuming user defined the following specialization_id:
-    // constexpr specialiation_id<int> int_const;
+    // constexpr specialization_id<int> int_const;
     // class Wrapper {
     // public:
     //   static constexpr specialization_id<float> float_const;
@@ -341,7 +341,7 @@ used to identify the specialization constants at SPIR-V level.
 As noted above one symbolic ID can have several numeric IDs assigned to it -
 such 1:N mapping comes from the fact that at SPIR-V level, composite
 specialization constants don't have dedicated IDs and they are being identified
-and specialized through their scalar leafs and corresponding numeric IDs.
+and specialized through their scalar leaves and corresponding numeric IDs.
 
 For example, the following code:
 ```
@@ -375,7 +375,7 @@ unique_symbolic_id_for_id_A -> { 1, 2, 3 }
 
 As it is shown in the example above, if a composite specialization constant
 contains another composite within it, that nested composite is also being
-"flattened" and its leafs are considered to be leafs of the parent
+"flattened" and its leaves are considered to be leaves of the parent
 specialization constants. This is done by depth-first search through the
 composite elements.
 
@@ -509,8 +509,8 @@ constant in that buffer:
 ```
 [
   0, // for id_int, the first constant is at the beginning of the buffer
-  4, // sizeof(int) == 4, the second constant is located right after the fisrt one
-  16, // sizeof(int) + sizezof(A) == 4, the same approach for the third constant
+  4, // sizeof(int) == 4, the second constant is located right after the first one
+  16, // sizeof(int) + sizeof(A) == 16, the same approach for the third constant
 ]
 ```
 
@@ -661,9 +661,9 @@ While transforming SYCL kernel function into an OpenCL kernel, DPC++ FE should
 - Communicate to DPC++ RT which kernel argument should be used for passing
   a buffer with specialization constant values when they are emulated.
 
-DPC++ FE provides implementation of `__builtin_sycl_unique_id` built-in function and
-it also populates special integration footer with the content required by DPC++
-RT for access to right device image properties describing specialization
+DPC++ FE provides implementation of `__builtin_sycl_unique_id` built-in function
+and it also populates special integration footer with the content required by
+DPC++ RT for access to right device image properties describing specialization
 constants.
 
 #### SYCL Kernel function transformations
@@ -763,7 +763,7 @@ struct A {
 };
 
 constexpr specialization_id<int> id_int;
-struct Wraper {
+struct Wrapper {
 public:
   static constexpr specialization_id<A> id_A;
 };
@@ -839,10 +839,10 @@ constexpr sycl::specialization_id<int> same_name{1};
 
 namespace {
   constexpr sycl::specialization_id<int> same_name{2};
-  /* application code that referenes ::(unnamed)::same_name */
+  /* application code that references ::(unnamed)::same_name */
   namespace {
     constexpr sycl::specialization_id<int> same_name{3};
-    /* application code that referenes ::(unnamed)::(unnamed)::same_name */
+    /* application code that references ::(unnamed)::(unnamed)::same_name */
   }
 }
 
@@ -899,7 +899,7 @@ namespace {
 
   namespace __sycl_detail {
     // Sometimes we need a 'shim', which points to another 'shim' in order to
-    // "extract" a variable from an anonymous namespace unambiguosly
+    // "extract" a variable from an anonymous namespace unambiguously
     static constexpr decltype(__sycl_detail::__shim_1()) &__shim_2() {
       // still address of ::(unnamed)::(unnamed)::same_name;
       return __sycl_detail::__shim_1();
@@ -972,7 +972,7 @@ address of the specialization constant provided by user and `offset` field of
 the descriptor as `(char*)(SpecConstantValuesMap[SymbolicID]) + offset`.
 
 That calculation is required, because at SPIR-V level composite
-specialization constants are respresented by several specialization constants
+specialization constants are represented by several specialization constants
 for each element of a composite, whilst on a SYCL level, the whole composite
 is passed by user as a single blob of data. `offset` field from properties is
 used to specify which exact piece of that blob should be extracted to perform
@@ -1053,7 +1053,7 @@ the translator will generate `OpSpecConstant` SPIR-V instructions with proper
               OpDecorate %A.float SpecId 44                        ; ID of the 2nd member
      %A.i32 = OpSpecConstant %int.type 0                           ; 1st member with default value
    %A.float = OpSpecConstant %float.type 0.0                       ; 2nd member with default value
-    %struct = OpSpecConstantComposite %struct.type %A.i32 %A.float ; Composite doens't need IDs or default value
+    %struct = OpSpecConstantComposite %struct.type %A.i32 %A.float ; Composite doesn't need IDs or default value
          %1 = OpTypeFunction %int
 
        %get = OpFunction %int None %1
 
@@ -26,6 +26,8 @@ namespace detail {
 /// std::mutex, that doesn't provide such guarantees).
 class SpinLock {
 public:
+  bool try_lock() { return !MLock.test_and_set(std::memory_order_acquire); }
+
   void lock() {
     while (MLock.test_and_set(std::memory_order_acquire))
       std::this_thread::yield();
 
@@ -15,6 +15,7 @@
 #include <sycl/detail/common.hpp>
 #include <sycl/detail/locked.hpp>
 #include <sycl/detail/os_util.hpp>
+#include <sycl/detail/spinlock.hpp>
 #include <sycl/detail/ur.hpp>
 #include <sycl/detail/util.hpp>
 
@@ -421,7 +422,7 @@ class KernelProgramCache {
 
   template <typename KeyT>
   KernelFastCacheValT tryToGetKernelFast(KeyT &&CacheKey) {
-    std::unique_lock<std::mutex> Lock(MKernelFastCacheMutex);
+    KernelFastCacheReadLockT Lock(MKernelFastCacheMutex);
     auto It = MKernelFastCache.find(CacheKey);
     if (It != MKernelFastCache.end()) {
       traceKernel("Kernel fetched.", CacheKey.second, true);
@@ -445,7 +446,7 @@ class KernelProgramCache {
         return;
     }
     // Save reference between the program and the fast cache key.
-    std::unique_lock<std::mutex> Lock(MKernelFastCacheMutex);
+    KernelFastCacheWriteLockT Lock(MKernelFastCacheMutex);
     MProgramToKernelFastCacheKeyMap[Program].emplace_back(CacheKey);
 
     // if no insertion took place, thus some other thread has already inserted
@@ -483,7 +484,7 @@ class KernelProgramCache {
 
       {
         // Remove corresponding entries from KernelFastCache.
-        std::unique_lock<std::mutex> Lock(MKernelFastCacheMutex);
+        KernelFastCacheWriteLockT Lock(MKernelFastCacheMutex);
         if (auto FastCacheKeyItr =
                 MProgramToKernelFastCacheKeyMap.find(NativePrg);
             FastCacheKeyItr != MProgramToKernelFastCacheKeyMap.end()) {
@@ -630,7 +631,7 @@ class KernelProgramCache {
     std::lock_guard<std::mutex> EvictionListLock(MProgramEvictionListMutex);
     std::lock_guard<std::mutex> L1(MProgramCacheMutex);
     std::lock_guard<std::mutex> L2(MKernelsPerProgramCacheMutex);
-    std::lock_guard<std::mutex> L3(MKernelFastCacheMutex);
+    KernelFastCacheWriteLockT L3(MKernelFastCacheMutex);
     MCachedPrograms = ProgramCache{};
     MKernelsPerProgramCache = KernelCacheT{};
     MKernelFastCache = KernelFastCacheT{};
@@ -758,7 +759,10 @@ class KernelProgramCache {
   KernelCacheT MKernelsPerProgramCache;
   ContextPtr MParentContext;
 
-  std::mutex MKernelFastCacheMutex;
+  using KernelFastCacheMutexT = SpinLock;
+  using KernelFastCacheReadLockT = std::lock_guard<KernelFastCacheMutexT>;
+  using KernelFastCacheWriteLockT = std::lock_guard<KernelFastCacheMutexT>;
+  KernelFastCacheMutexT MKernelFastCacheMutex;
   KernelFastCacheT MKernelFastCache;
 
   // Map between fast kernel cache keys and program handle.
 
@@ -68,10 +68,10 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
 /// all the memory objects allocated for command_buffer management
 ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
   // Release the memory allocated to the Context stored in the command_buffer
-  UR_TRACE(urContextRelease(Context));
+  UR_CALL_NOCHECK(urContextRelease(Context));
 
   // Release the device
-  UR_TRACE(urDeviceRelease(Device));
+  UR_CALL_NOCHECK(urDeviceRelease(Device));
 }
 
 // This may throw so it must be called from within a try...catch
 
@@ -18,23 +18,6 @@
 #include <memory>
 #include <unordered_set>
 
-// Trace an internal UR call
-#define UR_TRACE(Call)                                                         \
-  {                                                                            \
-    ur_result_t Result;                                                        \
-    UR_CALL(Call, Result);                                                     \
-  }
-
-// Trace an internal UR call and return the result to the user.
-#define UR_CALL(Call, Result)                                                  \
-  {                                                                            \
-    if (PrintTrace)                                                            \
-      logger::always("UR ---> {}", #Call);                                     \
-    Result = (Call);                                                           \
-    if (PrintTrace)                                                            \
-      logger::always("UR <--- {}({})", #Call, Result);                         \
-  }
-
 enum class CommandType {
   Kernel,
   USMMemcpy,
 
@@ -92,11 +92,6 @@ std::string getCudaVersionString() {
   return stream.str();
 }
 
-void detail::ur::die(const char *Message) {
-  logger::always("ur_die:{}", Message);
-  std::terminate();
-}
-
 void detail::ur::assertion(bool Condition, const char *Message) {
   if (!Condition)
     die(Message);