intel
diff --git a/‎.github/workflows/sycl-linux-run-tests.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/sycl-linux-run-tests.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎clang/lib/Driver/ToolChains/Cuda.cpp‎
Lines changed: 9 additions & 0 deletions b/‎clang/lib/Driver/ToolChains/Cuda.cpp‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp‎
Lines changed: 2 additions & 0 deletions b/‎clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎clang/test/Driver/sycl-nvptx-fast-math.cpp‎
Lines changed: 18 additions & 0 deletions b/‎clang/test/Driver/sycl-nvptx-fast-math.cpp‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎libdevice/nativecpu_utils.cpp‎
Lines changed: 11 additions & 20 deletions b/‎libdevice/nativecpu_utils.cpp‎
Lines changed: 11 additions & 20 deletions
diff --git a/‎llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td‎
Lines changed: 3 additions & 1 deletion b/‎llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp‎
Lines changed: 1 addition & 0 deletions b/‎llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎sycl/ReleaseNotes.md‎
Lines changed: 4 additions & 4 deletions b/‎sycl/ReleaseNotes.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎sycl/cmake/modules/FetchUnifiedRuntime.cmake‎
Lines changed: 8 additions & 7 deletions b/‎sycl/cmake/modules/FetchUnifiedRuntime.cmake‎
Lines changed: 8 additions & 7 deletions
diff --git a/‎sycl/doc/design/DeviceIf.md‎
Lines changed: 15 additions & 15 deletions b/‎sycl/doc/design/DeviceIf.md‎
Lines changed: 15 additions & 15 deletions
@@ -151,7 +151,7 @@ permissions:
 
 jobs:
   run:
-    if: inputs.skip_run == 'false'
+    if: github.event_name == 'workflow_dispatch' || inputs.skip_run == 'false'
     name: ${{ inputs.name }}
     runs-on: ${{ fromJSON(inputs.runner) }}
     container:
 
@@ -946,6 +946,15 @@ void CudaToolChain::addClangTargetOptions(
 
     if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt))
       CC1Args.push_back("-fcuda-prec-sqrt");
+
+    bool FastRelaxedMath = DriverArgs.hasFlag(
+        options::OPT_ffast_math, options::OPT_fno_fast_math, false);
+    bool UnsafeMathOpt =
+        DriverArgs.hasFlag(options::OPT_funsafe_math_optimizations,
+                           options::OPT_fno_unsafe_math_optimizations, false);
+    if (FastRelaxedMath || UnsafeMathOpt)
+      CC1Args.append({"-mllvm", "--nvptx-prec-divf32=0", "-mllvm",
+                      "--nvptx-prec-sqrtf32=0"});
   } else {
     CC1Args.append(
         {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
 
@@ -50,8 +50,10 @@
 // RUN:   FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
 // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx940 \
 // RUN:   -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
 // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx941 \
 // RUN:   -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
 // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx942 \
 // RUN:   -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
 
@@ -0,0 +1,24 @@
+// Check that -ffast-math and -funsafe-math-optimizations make the driver pass
+// --nvptx-prec-divf32=0 and --nvptx-prec-sqrtf32=0 to the backend for SYCL
+// CUDA targets, and that neither option is passed by default.
+
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -### -nocudalib \
+// RUN:   -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
+
+// RUN: %clang -### -nocudalib \
+// RUN:   -fsycl -fsycl-targets=nvptx64-nvidia-cuda -ffast-math %s 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-FAST %s
+
+// RUN: %clang -### -nocudalib \
+// RUN:   -fsycl -fsycl-targets=nvptx64-nvidia-cuda -funsafe-math-optimizations %s 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-FAST %s
+
+// CHECK-FAST: "-mllvm" "--nvptx-prec-divf32=0" "-mllvm" "--nvptx-prec-sqrtf32=0"
+
+// The quoted patterns below must keep the leading dashes: the driver prints
+// each argument as one quoted token, so a pattern without them never matches.
+// CHECK-DEFAULT-NOT: "--nvptx-prec-divf32=0"
+// CHECK-DEFAULT-NOT: "--nvptx-prec-sqrtf32=0"
@@ -65,13 +65,13 @@ __spirv_MemoryBarrier(uint32_t Memory, uint32_t Semantics) {
   template <>                                                                  \
   __SYCL_CONVERGENT__ DEVICE_EXTERNAL Type                                     \
   __spirv_SubgroupBlockReadINTEL<Type>(const OCL_GLOBAL PType *Ptr) noexcept { \
-    return *Ptr;                                                               \
+    return Ptr[__spirv_SubgroupLocalInvocationId()];                           \
   }                                                                            \
   template <>                                                                  \
   __SYCL_CONVERGENT__ DEVICE_EXTERNAL void                                     \
   __spirv_SubgroupBlockWriteINTEL<Type>(PType OCL_GLOBAL * ptr,                \
                                         Type v) noexcept {                     \
-    *(Type *)ptr = v;                                                          \
+    ((Type*)ptr)[__spirv_SubgroupLocalInvocationId()]  = v;                    \
   }
 
 #define DefSubgroupBlockINTEL_vt(Type, VT_name)                                \
@@ -92,16 +92,19 @@ template <class T> struct vtypes {
 DefSubgroupBlockINTEL(uint32_t) DefSubgroupBlockINTEL(uint64_t)
 DefSubgroupBlockINTEL(uint8_t) DefSubgroupBlockINTEL(uint16_t)
 
-#define DefineGOp1(spir_sfx, mux_name)\
-DEVICE_EXTERN_C bool mux_name(bool);\
+#define DefineGOp1(spir_sfx, name)\
+DEVICE_EXTERN_C bool __mux_sub_group_##name##_i1(bool);\
+DEVICE_EXTERN_C bool __mux_work_group_##name##_i1(uint32_t id, bool val);\
 DEVICE_EXTERNAL bool __spirv_Group ## spir_sfx(unsigned g, bool val) {\
   if (__spv::Scope::Flag::Subgroup == g)\
-    return mux_name(val);\
+    return __mux_sub_group_##name##_i1(val);\
+  else if (__spv::Scope::Flag::Workgroup == g)\
+    return __mux_work_group_##name##_i1(0, val);\
   return false;\
 }
 
-DefineGOp1(Any, __mux_sub_group_any_i1)
-DefineGOp1(All, __mux_sub_group_all_i1)
+DefineGOp1(Any, any)
+DefineGOp1(All, all)
 
 
 #define DefineGOp(Type, MuxType, spir_sfx, mux_sfx)                            \
@@ -184,18 +187,6 @@ DefineBitwiseGroupOp(uint64_t, int64_t, i64)
 
 DefineLogicalGroupOp(bool, bool, i1)
 
-#define DefineBroadCastImpl(Type, Sfx, MuxType, IDType)                       \
-  DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx(                   \
-      int32_t id, MuxType val, int64_t lidx, int64_t lidy, int64_t lidz);     \
-  DEVICE_EXTERN_C MuxType __mux_sub_group_broadcast_##Sfx(MuxType val,        \
-                                                          int32_t sg_lid);    \
-  DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v,             \
-                                              IDType l) {                     \
-    if (__spv::Scope::Flag::Subgroup == g)                                    \
-      return __mux_sub_group_broadcast_##Sfx(v, l);                           \
-    return Type(); /*todo: add support for other flags as they are tested*/   \
-  }
-
 #define DefineBroadcastMuxType(Type, Sfx, MuxType, IDType)                    \
   DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx(                   \
       int32_t id, MuxType val, uint64_t lidx, uint64_t lidy, uint64_t lidz);  \
@@ -216,7 +207,7 @@ DefineLogicalGroupOp(bool, bool, i1)
     if (__spv::Scope::Flag::Subgroup == g)                                    \
       return __mux_sub_group_broadcast_##Sfx(v, l[0]);                        \
     else                                                                      \
-      return __mux_work_group_broadcast_##Sfx(0, v, l[0], l[0], 0);           \
+      return __mux_work_group_broadcast_##Sfx(0, v, l[0], l[1], 0);           \
   }                                                                           \
                                                                               \
   DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v,             \
 
@@ -85,6 +85,7 @@ def AspectExt_oneapi_virtual_mem : Aspect<"ext_oneapi_virtual_mem">;
 def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">;
 def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">;
 def AspectExt_oneapi_atomic16 : Aspect<"ext_oneapi_atomic16">;
+def AspectExt_oneapi_virtual_functions : Aspect<"ext_oneapi_virtual_functions">;
 // Deprecated aspects
 def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
 def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">;
@@ -148,7 +149,8 @@ def : TargetInfo<"__TestAspectList",
     AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph, AspectExt_oneapi_private_alloca, 
     AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, 
     AspectExt_intel_fpga_task_sequence,
-    AspectExt_oneapi_atomic16],
+    AspectExt_oneapi_atomic16,
+    AspectExt_oneapi_virtual_functions],
     []>;
 // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT
 // match.
 
@@ -37,6 +37,7 @@ getSYCLESIMDSplitStatusFromMetadata(const Module &M) {
   assert(MDOp && "Unexpected metadata operand");
   const auto &MDConst = MDOp->getOperand(0);
   auto *MDVal = mdconst::dyn_extract_or_null<ConstantInt>(MDConst);
+  assert(MDVal && "Unexpected metadata operand type");
   uint8_t Val = MDVal->getZExtValue();
   assert(Val < 3 && "Unexpected value for split metadata");
   auto AsEnum = static_cast<module_split::SyclEsimdSplitStatus>(Val);
 
@@ -39,7 +39,7 @@ Release notes for commit range
   when compiling NVPTX. intel/llvm#14621
 - Added support for `::rand` and `::srand` in device code on Intel devices. intel/llvm#13506
 - Added support for `sm90a` CUDA target architecture. intel/llvm#14075
-- Added support for detecting misaligned data accesses via address sanitizer. intell/llvm#14148
+- Added support for detecting misaligned data accesses via address sanitizer. intel/llvm#14148
 - Added support for emitting multiple error reports via address sanitizer
   through `-fsanitize-recover=address`. intel/llvm#13948
 - Added initial support for
@@ -175,7 +175,7 @@ Release notes for commit range
   Please note that this functionality relies on the compiler knowing which
   targets support which optional kernel features and that database is not yet
   fully complete. In particular, data for Lunar Lake and Battlemage Intel GPUs
-  is still missing. intel/llvm#14590 intel/llvm#14188 intel/lvm#12727
+  is still missing. intel/llvm#14590 intel/llvm#14188 intel/llvm#12727
   intel/llvm#14757 intel/llvm#13486 intel/llvm#13974 intel/llvm#13617
 - Enhanced compiler to annotate SYCL kernel arguments passed by value with
   `__grid_constant__` for CUDA backend. intel/llvm#14322
@@ -273,7 +273,7 @@ Release notes for commit range
 - Relaxed diagnostic about using virtual functions in SYCL kernels: now it is
   only emitted if a call is performed using virtual call mechanism, but it is not
   emitted for non-virtual calls of virtual functions. See also
-  KhronosGroup/SYCL-Docs#565. intel/llvm#114051 intel/llvm#14141
+  KhronosGroup/SYCL-Docs#565. intel/llvm#14051 intel/llvm#14141
 - ESIMD API `inv` was extended to support `double` arguments. intel/llvm#13838
 - Enhanced validation (via `static_assert` mechanism) of template arguments of
   ESIMD `rdregion` and `wrregion` APIs. intel/llvm#13158
@@ -484,7 +484,7 @@ Release notes for commit range
   extension. intel/llvm#14522
 - Fixed a bug where defining kernel as a named functor whilst using
   `-fno-sycl-unnamed-lambda` would lead to a compilation error about unnamed
-  lambdas being unsupported. intel/lvm#14614
+  lambdas being unsupported. intel/llvm#14614
 - Fixed an issue on CUDA & AMDGPU backends where `multi_ptr` relational
   operators taking `std::nullptr_t` would produce different results to their
   corresponding standard C++ helpers like `std::less`. intel/llvm#13201
 
@@ -117,13 +117,14 @@ if(SYCL_UR_USE_FETCH_CONTENT)
   endfunction()
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit b79ebe4e98789144bcdf3832088eb6e6b5ae6967
-  # Merge: 7b4bc761 fbb6e862
-  # Author: Kenneth Benzie (Benie) <[email protected]>
-  # Date:   Fri Oct 4 16:39:59 2024 +0100
-  #     Merge pull request #2018 from wenju-he/L0-bindless-image-device-query
-  #     [L0] Fix device query of bindless image support
-  set(UNIFIED_RUNTIME_TAG b79ebe4e98789144bcdf3832088eb6e6b5ae6967)
+  # commit df6da35d6e67f2383db28dd49ab08c5c0ef541d2
+  # Merge: 67590533 55bd5636
+  # Author: aarongreig <[email protected]>
+  # Date:   Mon Oct 7 12:28:07 2024 +0100
+  #     Merge pull request #2038 from GeorgeWeb/georgi/unsupported-max-coop-wgsize
+  #     [UR][hip][opencl] Mark urKernelSuggestMaxCooperativeGroupCountExp as unsupported
+  #     instead of returning misleading default value
+  set(UNIFIED_RUNTIME_TAG df6da35d6e67f2383db28dd49ab08c5c0ef541d2)
 
   set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
   # Due to the use of dependentloadflag and no installer for UMF and hwloc we need
 
@@ -169,21 +169,21 @@ one of the following corresponding C++ macro names:
 * `__SYCL_TARGET_INTEL_GPU_ACM_G11__`
 * `__SYCL_TARGET_INTEL_GPU_ACM_G12__`
 * `__SYCL_TARGET_INTEL_GPU_PVC__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM50__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM52__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM53__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM60__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM61__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM62__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM70__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM72__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM75__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM80__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM86__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM87__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM89__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM90__`
-* `__SYCL_TARGET_NVIDIA_GPU_SM90A__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_50__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_52__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_53__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_60__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_61__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_62__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_70__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_72__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_75__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_80__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_86__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_87__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_89__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_90__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM_90A__`
 * `__SYCL_TARGET_AMD_GPU_GFX700__`
 * `__SYCL_TARGET_AMD_GPU_GFX701__`
 * `__SYCL_TARGET_AMD_GPU_GFX702__`