Skip to content

Commit b345981

Browse files
jhuber6 authored and ronlieb committed
[OpenMP] Use-generic-IR-for-the-OpenMP-DeviceRTL
Change-Id: I98516000f23c7e4809a20b075498e478b0474440
1 parent 7a9d0b9 commit b345981

File tree

17 files changed

+219
-301
lines changed

17 files changed

+219
-301
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,6 +1284,9 @@ OpenMP Support
12841284
- Added support for 'omp assume' directive.
12851285
- Added support for 'omp scope' directive.
12861286
- Added support for allocator-modifier in 'allocate' clause.
1287+
- Changed the OpenMP DeviceRTL to use 'generic' IR. The
1288+
``LIBOMPTARGET_DEVICE_ARCHITECTURES`` CMake argument is now unused; support for
1289+
AMDGPU and NVPTX targets is always built.
12871290

12881291
Improvements
12891292
^^^^^^^^^^^^

clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ const char *amdgpu::dlr::getLinkCommandArgs(
252252
// the look-up of the libomptarget bc lib to happen and if not present
253253
// where it is expected it means we are using the build tree compiler
254254
// not the installed compiler.
255-
std::string LibDeviceName = "/libomptarget-amdgpu-" + GPUArch.str() + ".bc";
255+
std::string LibDeviceName = "/libomptarget-amdgpu.bc";
256256

257257
if (!Args.hasArg(options::OPT_nogpulib)) {
258258
// Check if libomptarget device bitcode can be found in a LIBRARY_PATH dir
@@ -279,7 +279,7 @@ const char *amdgpu::dlr::getLinkCommandArgs(
279279
BCLibs.push_back(Args.MakeArgString(bc_file_lib));
280280
else
281281
TC.getDriver().Diag(diag::err_drv_omp_offload_target_bcruntime_not_found)
282-
<< "libomptarget-amdgpu-" + GPUArch.str() + ".bc";
282+
<< "libomptarget-amdgpu.bc";
283283
}
284284

285285
if (!AsanRTL.empty()) {
@@ -398,7 +398,6 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
398398
HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
399399

400400
StringRef GPUArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
401-
assert(!GPUArch.empty() && "Must have an explicit GPU arch.");
402401

403402
assert(DeviceOffloadingKind == Action::OFK_OpenMP &&
404403
"Only OpenMP offloading kinds are supported.");

clang/lib/Driver/ToolChains/CommonArgs.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3040,8 +3040,9 @@ void tools::addOpenMPDeviceRTL(const Driver &D,
30403040
StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx";
30413041
std::string LibOmpTargetName =
30423042
Triple.isAMDGCN()
3043-
? ("libomptarget-old-amdgpu-" + BitcodeSuffix + ".bc").str()
3044-
: ("libomptarget-nvptx-" + BitcodeSuffix + ".bc").str();
3043+
? ("libomptarget-old-amdgpu.bc")
3044+
: ("libomptarget-nvptx.bc");
3045+
30453046

30463047
// First check whether user specifies bc library
30473048
if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) {

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -877,9 +877,6 @@ void CudaToolChain::addClangTargetOptions(
877877
HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
878878

879879
StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
880-
if (GpuArch.empty())
881-
GpuArch = getProcessorFromTargetID(this->getTriple(), this->getTargetID());
882-
assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
883880
assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
884881
DeviceOffloadingKind == Action::OFK_Cuda) &&
885882
"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");

clang/test/Driver/openmp-offload-gpu.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,8 @@
9494
// RUN: %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s
9595

9696
// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
97-
// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_52.bc
98-
// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_52.bc
97+
// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx.bc
98+
// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx.bc
9999
// CHK-BCLIB-NOT: {{error:|warning:}}
100100

101101
/// ###########################################################################

offload/DeviceRTL/CMakeLists.txt

Lines changed: 40 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -42,50 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
4242
set(include_directory ${devicertl_base_directory}/include)
4343
set(source_directory ${devicertl_base_directory}/src)
4444

45-
set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80 86 89 87 90)
46-
set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803"
47-
"gfx9-generic;gfx900;gfx902;gfx906;gfx908"
48-
"gfx90a;gfx90c"
49-
"gfx9-4-generic;gfx940;gfx941;gfx942;gfx950"
50-
"gfx10-1-generic;gfx1010;gfx1012"
51-
"gfx10-3-generic;gfx1030;gfx1031;gfx1032;gfx1033"
52-
"gfx1034;gfx1035;gfx1036"
53-
"gfx11-generic;gfx1100;gfx1101;gfx1102;gfx1103"
54-
"gfx1150;gfx1151;gfx1152;gfx1153"
55-
"gfx12-generic;gfx1200;gfx1201")
56-
set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
57-
"sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
58-
set(all_gpu_architectures
59-
"${all_amdgpu_architectures};${all_nvptx_architectures}")
60-
61-
# AMD internal build scripts use LIBOMPTARGET_AMDGCN_GFXLIST
62-
if(DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
63-
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_AMDGCN_GFXLIST} CACHE STRING
64-
"List of device architectures to be used to compile the OpenMP DeviceRTL.")
65-
else()
66-
set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
67-
"List of device architectures to be used to compile the OpenMP DeviceRTL.")
68-
endif()
69-
70-
if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
71-
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
72-
elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu")
73-
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures})
74-
elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx")
75-
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures})
76-
elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
77-
LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
78-
if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
79-
message(FATAL_ERROR
80-
"Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
81-
elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
82-
message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'")
83-
endif()
84-
set(LIBOMPTARGET_DEVICE_ARCHITECTURES
85-
"${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
86-
endif()
87-
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
88-
8945
set(include_files
9046
${include_directory}/Allocator.h
9147
${include_directory}/Configuration.h
@@ -170,20 +126,21 @@ endif()
170126

171127
# first create an object target
172128
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
173-
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
129+
function(compileDeviceRTLLibrary target_name target_triple)
174130
set(target_bc_flags ${ARGN})
175131

176132
set(bc_files "")
177133
foreach(src ${src_files})
178134
get_filename_component(infile ${src} ABSOLUTE)
179135
get_filename_component(outfile ${src} NAME)
180-
set(outfile "${outfile}-${target_cpu}.bc")
181-
set(depfile "${outfile}.d")
136+
set(outfile "${outfile}-${target_name}.bc")
137+
set(depfile "${outfile}-${target_name}.d")
182138

183139
add_custom_target(${outfile}
184140
COMMAND ${CLANG_TOOL}
185141
${bc_flags}
186-
--offload-arch=${target_cpu}
142+
-fopenmp-targets=${target_triple}
143+
-Xopenmp-target=${target_triple} -march=
187144
${target_bc_flags}
188145
-MD -MF ${depfile}
189146
${infile} -o ${outfile}
@@ -194,7 +151,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
194151

195152
if(${outfile} MATCHES "State.cpp")
196153
# Run the prep tool on the library to replace internal attribute with linkonce_odr for dm_alloc only.
197-
set(outfile_prep "${outfile}-${target_cpu}-prep.bc")
154+
set(outfile_prep "${outfile}-${target_name}-prep.bc")
198155
add_custom_target(${outfile_prep}
199156
COMMAND ${PREP_TOOL} -dm ${outfile}
200157
-o ${outfile_prep}
@@ -220,21 +177,21 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
220177
if("${target_name}" STREQUAL "amdgpu")
221178
if(OPENMP_STANDALONE_BUILD)
222179
list(APPEND extra_bc_files
223-
${CMAKE_BINARY_DIR}/hostexec/libhostexec-${target_cpu}.bc
180+
${CMAKE_BINARY_DIR}/hostexec/libhostexec-${target_name}.bc
224181
)
225182
else()
226183
list(APPEND extra_bc_files
227-
${CMAKE_BINARY_DIR}/offload/hostexec/libhostexec-${target_cpu}.bc
184+
${CMAKE_BINARY_DIR}/offload/hostexec/libhostexec-${target_name}.bc
228185
)
229186
endif()
230-
add_custom_target(libhostexec-${target_cpu}
187+
add_custom_target(libhostexec-${target_name}
231188
DEPENDS ${extra_bc_files}
232189
)
233-
add_dependencies(libhostexec-${target_cpu} libhostexec-${target_cpu}.bc)
234-
set(extra_depends libhostexec-${target_cpu})
190+
add_dependencies(libhostexec-${target_name} libhostexec-${target_name}.bc)
191+
set(extra_depends libhostexec-${target_name})
235192
endif()
236193

237-
set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
194+
set(bclib_name "libomptarget-${target_name}.bc")
238195

239196
# Link to a bitcode library.
240197
add_custom_target(linked_${bclib_name}
@@ -293,9 +250,9 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
293250
add_dependencies(${bclib_name} prep-libomptarget-bc)
294251
endif()
295252

296-
add_dependencies(omptarget.devicertl.all_objs embedded_${target_name}-${target_cpu})
253+
add_dependencies(omptarget.devicertl.all_objs embedded_${target_name})
297254

298-
set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
255+
set(bclib_target_name "omptarget-${target_name}-bc")
299256
add_custom_target(${bclib_target_name} ALL)
300257
add_dependencies(${bclib_target_name} ${bclib_name})
301258

@@ -321,26 +278,26 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
321278
# Package the bitcode into an LLVM offloading binary and embed it in an ELF for the static library
322279
add_custom_target(packaged_${bclib_name}
323280
COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
324-
"--image=file=${CMAKE_CURRENT_BINARY_DIR}/opt_${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
281+
"--image=file=${CMAKE_CURRENT_BINARY_DIR}/opt_${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp"
325282
COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
326283
)
327284
add_dependencies(packaged_${bclib_name} opt_${bclib_name})
328285
if("${PACKAGER_TOOL}" STREQUAL "$<TARGET_FILE:clang-offload-packager>")
329286
add_dependencies(packaged_${bclib_name} clang-offload-packager)
330287
endif()
331288

332-
set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
333-
add_custom_target(embedded_${target_name}-${target_cpu}
289+
set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o")
290+
add_custom_target(embedded_${target_name}
334291
COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
335292
-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
336293
-o ${output_name}
337294
${source_directory}/Stub.cpp
338295
DEPENDS ${source_directory}/Stub.cpp
339296
BYPRODUCTS ${output_name}
340-
COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
297+
COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o"
341298
VERBATIM
342299
)
343-
add_dependencies(embedded_${target_name}-${target_cpu} packaged_${bclib_name})
300+
add_dependencies(embedded_${target_name} packaged_${bclib_name})
344301
if("${CLANG_TOOL}" STREQUAL "$<TARGET_FILE:clang>")
345302
add_dependencies(${output_name} clang)
346303
endif()
@@ -349,10 +306,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
349306
set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})
350307

351308
if (CMAKE_EXPORT_COMPILE_COMMANDS)
352-
set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
309+
set(ide_target_name omptarget-ide-${target_name})
353310
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
354311
target_compile_options(${ide_target_name} PRIVATE
355-
-fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
312+
-fopenmp -fopenmp-cuda-mode
313+
-fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
356314
-mllvm -openmp-opt-disable
357315
-foffload-lto -fvisibility=hidden --offload-device-only
358316
-nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
@@ -371,32 +329,25 @@ endfunction()
371329
# Generate one generic Bitcode library for each supported GPU target (AMDGPU and NVPTX).
372330
add_custom_target(omptarget.devicertl.nvptx)
373331
add_custom_target(omptarget.devicertl.amdgpu)
374-
foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
375-
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
376-
find_package(AMDDeviceLibs REQUIRED CONFIG
377-
HINTS ${CMAKE_INSTALL_PREFIX}
378-
${CMAKE_BINARY_DIR}/../../tools/rocm-device-libs
379-
PATHS /opt/rocm
380-
)
381-
382-
# Link in the ROCm Device Libraries once the other files have been linked.
383-
get_target_property(ocml_path ocml IMPORTED_LOCATION)
384-
get_target_property(ockl_path ockl IMPORTED_LOCATION)
385-
386-
set(amd_options -Xclang -mcode-object-version=none
387-
-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}
388-
-Xclang -mlink-builtin-bitcode -Xclang ${ockl_path}
389-
-Wno-linker-warnings # Silence the empty host compilation.
390-
-Xclang -mcode-object-version=none
391-
)
332+
find_package(AMDDeviceLibs REQUIRED CONFIG
333+
HINTS ${CMAKE_INSTALL_PREFIX}
334+
${CMAKE_BINARY_DIR}/../../tools/rocm-device-libs
335+
PATHS /opt/rocm
336+
)
392337

393-
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa ${amd_options})
394-
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
395-
compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
396-
else()
397-
message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
398-
endif()
399-
endforeach()
338+
# Link in the ROCm Device Libraries once the other files have been linked.
339+
get_target_property(ocml_path ocml IMPORTED_LOCATION)
340+
get_target_property(ockl_path ockl IMPORTED_LOCATION)
341+
342+
set(amd_options -Xclang -mcode-object-version=none
343+
-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}
344+
-Xclang -mlink-builtin-bitcode -Xclang ${ockl_path}
345+
-Wno-linker-warnings # Silence the empty host compilation.
346+
-Xclang -mcode-object-version=none
347+
)
348+
349+
compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa ${amd_options})
350+
compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
400351

401352
# Archive all the object files generated above into a static library
402353
add_library(omptarget.devicertl STATIC)

offload/DeviceRTL/include/Platform.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,8 @@ extern const inline bool __oclc_correctly_rounded_sqrt32 = 1;
3333
// Disable finite math optimizations.
3434
extern const inline bool __oclc_finite_only_opt = 0;
3535

36-
// Disable finite math optimizations.
37-
extern const inline bool __oclc_wavefrontsize64 =
38-
__AMDGCN_WAVEFRONT_SIZE == 64 ? 1 : 0;
36+
// Spoof this to wave64 since we only compile for a single architecture.
37+
extern const inline bool __oclc_wavefrontsize64 = 1;
3938

4039
#if defined(__gfx700__)
4140
extern const inline unsigned __oclc_ISA_version = 7000;
@@ -138,7 +137,9 @@ extern const inline unsigned __oclc_ISA_version = 11003;
138137
#elif defined(__gfx12_generic__)
139138
extern const inline unsigned __oclc_ISA_version = 12000;
140139
#else
141-
#error "Unknown AMDGPU architecture"
140+
// The only thing this controls that we care about is fast FMA.
141+
// FIXME: We need to stop relying on the DeviceRTL math libs this way.
142+
extern const inline unsigned __oclc_ISA_version = 7001;
142143
#endif
143144
}
144145

offload/DeviceRTL/src/ExtraMapping.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,7 @@ static uint32_t __kmpc_impl_smid() {
7171
}
7272

7373
static uint32_t getGenericModeMainThreadId() {
74-
unsigned Mask =
75-
llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>().GV_Warp_Size -
76-
1;
74+
unsigned Mask = __builtin_amdgcn_wavefrontsize() - 1;
7775
return (__kmpc_get_hardware_num_threads_in_block() - 1) & (~Mask);
7876
}
7977

0 commit comments

Comments
 (0)