Skip to content

Commit 95eb97a

Browse files
committed
Merge branch 'sycl' into multiple-build
2 parents 8adf32c + 4797d65 commit 95eb97a

File tree

103 files changed

+1459
-278
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+1459
-278
lines changed

.github/workflows/sycl-linux-run-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ permissions:
151151

152152
jobs:
153153
run:
154-
if: inputs.skip_run == 'false'
154+
if: github.event_name == 'workflow_dispatch' || inputs.skip_run == 'false'
155155
name: ${{ inputs.name }}
156156
runs-on: ${{ fromJSON(inputs.runner) }}
157157
container:

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,15 @@ void CudaToolChain::addClangTargetOptions(
946946

947947
if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt))
948948
CC1Args.push_back("-fcuda-prec-sqrt");
949+
950+
bool FastRelaxedMath = DriverArgs.hasFlag(
951+
options::OPT_ffast_math, options::OPT_fno_fast_math, false);
952+
bool UnsafeMathOpt =
953+
DriverArgs.hasFlag(options::OPT_funsafe_math_optimizations,
954+
options::OPT_fno_unsafe_math_optimizations, false);
955+
if (FastRelaxedMath || UnsafeMathOpt)
956+
CC1Args.append({"-mllvm", "--nvptx-prec-divf32=0", "-mllvm",
957+
"--nvptx-prec-sqrtf32=0"});
949958
} else {
950959
CC1Args.append(
951960
{"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});

clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@
5050
// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
5151
// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx940 \
5252
// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \
53+
// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
5354
// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx941 \
5455
// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \
56+
// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
5557
// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx942 \
5658
// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \
5759
// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// REQUIRES: nvptx-registered-target
2+
3+
// RUN: %clang -### -nocudalib \
4+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \
5+
// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
6+
7+
// RUN: %clang -### -nocudalib \
8+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -ffast-math %s 2>&1 \
9+
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
10+
11+
// RUN: %clang -### -nocudalib \
12+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -funsafe-math-optimizations %s 2>&1 \
13+
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
14+
15+
// CHECK-FAST: "-mllvm" "--nvptx-prec-divf32=0" "-mllvm" "--nvptx-prec-sqrtf32=0"
16+
17+
// CHECK-DEFAULT-NOT: "nvptx-prec-divf32=0"
18+
// CHECK-DEFAULT-NOT: "nvptx-prec-sqrtf32=0"

libdevice/nativecpu_utils.cpp

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,13 @@ __spirv_MemoryBarrier(uint32_t Memory, uint32_t Semantics) {
6565
template <> \
6666
__SYCL_CONVERGENT__ DEVICE_EXTERNAL Type \
6767
__spirv_SubgroupBlockReadINTEL<Type>(const OCL_GLOBAL PType *Ptr) noexcept { \
68-
return *Ptr; \
68+
return Ptr[__spirv_SubgroupLocalInvocationId()]; \
6969
} \
7070
template <> \
7171
__SYCL_CONVERGENT__ DEVICE_EXTERNAL void \
7272
__spirv_SubgroupBlockWriteINTEL<Type>(PType OCL_GLOBAL * ptr, \
7373
Type v) noexcept { \
74-
*(Type *)ptr = v; \
74+
((Type*)ptr)[__spirv_SubgroupLocalInvocationId()] = v; \
7575
}
7676

7777
#define DefSubgroupBlockINTEL_vt(Type, VT_name) \
@@ -92,16 +92,19 @@ template <class T> struct vtypes {
9292
DefSubgroupBlockINTEL(uint32_t) DefSubgroupBlockINTEL(uint64_t)
9393
DefSubgroupBlockINTEL(uint8_t) DefSubgroupBlockINTEL(uint16_t)
9494

95-
#define DefineGOp1(spir_sfx, mux_name)\
96-
DEVICE_EXTERN_C bool mux_name(bool);\
95+
#define DefineGOp1(spir_sfx, name)\
96+
DEVICE_EXTERN_C bool __mux_sub_group_##name##_i1(bool);\
97+
DEVICE_EXTERN_C bool __mux_work_group_##name##_i1(uint32_t id, bool val);\
9798
DEVICE_EXTERNAL bool __spirv_Group ## spir_sfx(unsigned g, bool val) {\
9899
if (__spv::Scope::Flag::Subgroup == g)\
99-
return mux_name(val);\
100+
return __mux_sub_group_##name##_i1(val);\
101+
else if (__spv::Scope::Flag::Workgroup == g)\
102+
return __mux_work_group_##name##_i1(0, val);\
100103
return false;\
101104
}
102105

103-
DefineGOp1(Any, __mux_sub_group_any_i1)
104-
DefineGOp1(All, __mux_sub_group_all_i1)
106+
DefineGOp1(Any, any)
107+
DefineGOp1(All, all)
105108

106109

107110
#define DefineGOp(Type, MuxType, spir_sfx, mux_sfx) \
@@ -184,18 +187,6 @@ DefineBitwiseGroupOp(uint64_t, int64_t, i64)
184187

185188
DefineLogicalGroupOp(bool, bool, i1)
186189

187-
#define DefineBroadCastImpl(Type, Sfx, MuxType, IDType) \
188-
DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx( \
189-
int32_t id, MuxType val, int64_t lidx, int64_t lidy, int64_t lidz); \
190-
DEVICE_EXTERN_C MuxType __mux_sub_group_broadcast_##Sfx(MuxType val, \
191-
int32_t sg_lid); \
192-
DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v, \
193-
IDType l) { \
194-
if (__spv::Scope::Flag::Subgroup == g) \
195-
return __mux_sub_group_broadcast_##Sfx(v, l); \
196-
return Type(); /*todo: add support for other flags as they are tested*/ \
197-
}
198-
199190
#define DefineBroadcastMuxType(Type, Sfx, MuxType, IDType) \
200191
DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx( \
201192
int32_t id, MuxType val, uint64_t lidx, uint64_t lidy, uint64_t lidz); \
@@ -216,7 +207,7 @@ DefineLogicalGroupOp(bool, bool, i1)
216207
if (__spv::Scope::Flag::Subgroup == g) \
217208
return __mux_sub_group_broadcast_##Sfx(v, l[0]); \
218209
else \
219-
return __mux_work_group_broadcast_##Sfx(0, v, l[0], l[0], 0); \
210+
return __mux_work_group_broadcast_##Sfx(0, v, l[0], l[1], 0); \
220211
} \
221212
\
222213
DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v, \

llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def AspectExt_oneapi_virtual_mem : Aspect<"ext_oneapi_virtual_mem">;
8585
def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">;
8686
def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">;
8787
def AspectExt_oneapi_atomic16 : Aspect<"ext_oneapi_atomic16">;
88+
def AspectExt_oneapi_virtual_functions : Aspect<"ext_oneapi_virtual_functions">;
8889
// Deprecated aspects
8990
def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
9091
def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">;
@@ -148,7 +149,8 @@ def : TargetInfo<"__TestAspectList",
148149
AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph, AspectExt_oneapi_private_alloca,
149150
AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group,
150151
AspectExt_intel_fpga_task_sequence,
151-
AspectExt_oneapi_atomic16],
152+
AspectExt_oneapi_atomic16,
153+
AspectExt_oneapi_virtual_functions],
152154
[]>;
153155
// This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT
154156
// match.

llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ getSYCLESIMDSplitStatusFromMetadata(const Module &M) {
3737
assert(MDOp && "Unexpected metadata operand");
3838
const auto &MDConst = MDOp->getOperand(0);
3939
auto *MDVal = mdconst::dyn_extract_or_null<ConstantInt>(MDConst);
40+
assert(MDVal && "Unexpected metadata operand type");
4041
uint8_t Val = MDVal->getZExtValue();
4142
assert(Val < 3 && "Unexpected value for split metadata");
4243
auto AsEnum = static_cast<module_split::SyclEsimdSplitStatus>(Val);

sycl/ReleaseNotes.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ Release notes for commit range
3939
when compiling NVPTX. intel/llvm#14621
4040
- Added support for `::rand` and `::srand` in device code on Intel devices. intel/llvm#13506
4141
- Added support for `sm90a` CUDA target architecture. intel/llvm#14075
42-
- Added support for detecting misaligned data accesses via address sanitizer. intell/llvm#14148
42+
- Added support for detecting misaligned data accesses via address sanitizer. intel/llvm#14148
4343
- Added support for emitting multiple error reports via address sanitizer
4444
through `-fsanitize-recover=address`. intel/llvm#13948
4545
- Added initial support for
@@ -175,7 +175,7 @@ Release notes for commit range
175175
Please note that this functionality relies on the compile knowing which
176176
targets support which optional kernel features and that database is not yet
177177
fully complete. In particular, data for Lunar Lake and Battlemage Intel GPUs
178-
is still missing. intel/llvm#14590 intel/llvm#14188 intel/lvm#12727
178+
is still missing. intel/llvm#14590 intel/llvm#14188 intel/llvm#12727
179179
intel/llvm#14757 intel/llvm#13486 intel/llvm#13974 intel/llvm#13617
180180
- Enhanced compiler to annotate SYCL kernel arguments passed by value with
181181
`__grid_constant__` for CUDA backend. intel/llvm#14322
@@ -273,7 +273,7 @@ Release notes for commit range
273273
- Relaxed diagnostic about using virtual functions in SYCL kernels: now it is
274274
only emitted if a call is perfomed using virtual call mechanism, but it is not
275275
emitted for non-virtual calls of virtual functions. See also
276-
KhronosGroup/SYCL-Docs#565. intel/llvm#114051 intel/llvm#14141
276+
KhronosGroup/SYCL-Docs#565. intel/llvm#14051 intel/llvm#14141
277277
- ESIMD API `inv` was extended to support `double` arguments. intel/llvm#13838
278278
- Enhanced validation (via `static_assert` mechanism) of template arguments of
279279
ESIMD `rdregion` and `wrregion` APIs. intel/llvm#13158
@@ -484,7 +484,7 @@ Release notes for commit range
484484
extension. intel/llvm#14522
485485
- Fixed a bug where defining kernel as a named functor whilst using
486486
`-fno-sycl-unnamed-lambda` would lead to a compilation error about unnamed
487-
lambdas being unsupported. intel/lvm#14614
487+
lambdas being unsupported. intel/llvm#14614
488488
- Fixed an issue on CUDA & AMDGPU backends where `multi_ptr` relational
489489
operators taking `std::nullptr_t` would produce different results to their
490490
corresponding standard C++ helpers like `std::less`. intel/llvm#13201

sycl/cmake/modules/FetchUnifiedRuntime.cmake

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,14 @@ if(SYCL_UR_USE_FETCH_CONTENT)
117117
endfunction()
118118

119119
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
120-
# commit b79ebe4e98789144bcdf3832088eb6e6b5ae6967
121-
# Merge: 7b4bc761 fbb6e862
122-
# Author: Kenneth Benzie (Benie) <[email protected]>
123-
# Date: Fri Oct 4 16:39:59 2024 +0100
124-
# Merge pull request #2018 from wenju-he/L0-bindless-image-device-query
125-
# [L0] Fix device query of bindless image support
126-
set(UNIFIED_RUNTIME_TAG b79ebe4e98789144bcdf3832088eb6e6b5ae6967)
120+
# commit df6da35d6e67f2383db28dd49ab08c5c0ef541d2
121+
# Merge: 67590533 55bd5636
122+
# Author: aarongreig <[email protected]>
123+
# Date: Mon Oct 7 12:28:07 2024 +0100
124+
# Merge pull request #2038 from GeorgeWeb/georgi/unsupported-max-coop-wgsize
125+
# [UR][hip][opencl] Mark urKernelSuggestMaxCooperativeGroupCountExp as unsupported
126+
# instead of returning misleading default value
127+
set(UNIFIED_RUNTIME_TAG df6da35d6e67f2383db28dd49ab08c5c0ef541d2)
127128

128129
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
129130
# Due to the use of dependentloadflag and no installer for UMF and hwloc we need

sycl/doc/design/DeviceIf.md

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -169,21 +169,21 @@ one of the following corresponding C++ macro names:
169169
* `__SYCL_TARGET_INTEL_GPU_ACM_G11__`
170170
* `__SYCL_TARGET_INTEL_GPU_ACM_G12__`
171171
* `__SYCL_TARGET_INTEL_GPU_PVC__`
172-
* `__SYCL_TARGET_NVIDIA_GPU_SM50__`
173-
* `__SYCL_TARGET_NVIDIA_GPU_SM52__`
174-
* `__SYCL_TARGET_NVIDIA_GPU_SM53__`
175-
* `__SYCL_TARGET_NVIDIA_GPU_SM60__`
176-
* `__SYCL_TARGET_NVIDIA_GPU_SM61__`
177-
* `__SYCL_TARGET_NVIDIA_GPU_SM62__`
178-
* `__SYCL_TARGET_NVIDIA_GPU_SM70__`
179-
* `__SYCL_TARGET_NVIDIA_GPU_SM72__`
180-
* `__SYCL_TARGET_NVIDIA_GPU_SM75__`
181-
* `__SYCL_TARGET_NVIDIA_GPU_SM80__`
182-
* `__SYCL_TARGET_NVIDIA_GPU_SM86__`
183-
* `__SYCL_TARGET_NVIDIA_GPU_SM87__`
184-
* `__SYCL_TARGET_NVIDIA_GPU_SM89__`
185-
* `__SYCL_TARGET_NVIDIA_GPU_SM90__`
186-
* `__SYCL_TARGET_NVIDIA_GPU_SM90A__`
172+
* `__SYCL_TARGET_NVIDIA_GPU_SM_50__`
173+
* `__SYCL_TARGET_NVIDIA_GPU_SM_52__`
174+
* `__SYCL_TARGET_NVIDIA_GPU_SM_53__`
175+
* `__SYCL_TARGET_NVIDIA_GPU_SM_60__`
176+
* `__SYCL_TARGET_NVIDIA_GPU_SM_61__`
177+
* `__SYCL_TARGET_NVIDIA_GPU_SM_62__`
178+
* `__SYCL_TARGET_NVIDIA_GPU_SM_70__`
179+
* `__SYCL_TARGET_NVIDIA_GPU_SM_72__`
180+
* `__SYCL_TARGET_NVIDIA_GPU_SM_75__`
181+
* `__SYCL_TARGET_NVIDIA_GPU_SM_80__`
182+
* `__SYCL_TARGET_NVIDIA_GPU_SM_86__`
183+
* `__SYCL_TARGET_NVIDIA_GPU_SM_87__`
184+
* `__SYCL_TARGET_NVIDIA_GPU_SM_89__`
185+
* `__SYCL_TARGET_NVIDIA_GPU_SM_90__`
186+
* `__SYCL_TARGET_NVIDIA_GPU_SM_90A__`
187187
* `__SYCL_TARGET_AMD_GPU_GFX700__`
188188
* `__SYCL_TARGET_AMD_GPU_GFX701__`
189189
* `__SYCL_TARGET_AMD_GPU_GFX702__`

0 commit comments

Comments
 (0)