[OpenMP] Use generic IR for the OpenMP DeviceRTL #119091
Changes from all commits
offload/DeviceRTL/CMakeLists.txt

@@ -42,43 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
 set(include_directory ${devicertl_base_directory}/include)
 set(source_directory ${devicertl_base_directory}/src)
 
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803"
-                             "gfx9-generic;gfx900;gfx902;gfx906;gfx908"
-                             "gfx90a;gfx90c"
-                             "gfx9-4-generic;gfx940;gfx941;gfx942;gfx950"
-                             "gfx10-1-generic;gfx1010;gfx1012"
-                             "gfx10-3-generic;gfx1030;gfx1031;gfx1032;gfx1033"
-                             "gfx1034;gfx1035;gfx1036"
-                             "gfx11-generic;gfx1100;gfx1101;gfx1102;gfx1103"
-                             "gfx1150;gfx1151;gfx1152;gfx1153"
-                             "gfx12-generic")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-                            "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
-set(all_gpu_architectures
-    "${all_amdgpu_architectures};${all_nvptx_architectures}")
-
-set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
-    "List of device architectures to be used to compile the OpenMP DeviceRTL.")
-
-if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
-       LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
-  if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
-    message(FATAL_ERROR
-      "Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
-  elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
-    message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'")
-  endif()
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES
-      "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
-endif()
-list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
-
 set(include_files
   ${include_directory}/Allocator.h
   ${include_directory}/Configuration.h
@@ -146,20 +109,22 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
 # first create an object target
 add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
-function(compileDeviceRTLLibrary target_cpu target_name target_triple)
+function(compileDeviceRTLLibrary target_name target_triple)
   set(target_bc_flags ${ARGN})
 
   set(bc_files "")
   foreach(src ${src_files})
     get_filename_component(infile ${src} ABSOLUTE)
     get_filename_component(outfile ${src} NAME)
-    set(outfile "${outfile}-${target_cpu}.bc")
+    set(outfile "${outfile}-${target_name}.bc")
     set(depfile "${outfile}.d")
 
+    # Passing an empty CPU to -march= suppresses target-specific metadata.
     add_custom_command(OUTPUT ${outfile}
       COMMAND ${CLANG_TOOL}
       ${bc_flags}
-      --offload-arch=${target_cpu}
+      -fopenmp-targets=${target_triple}
+      -Xopenmp-target=${target_triple} -march=
      ${target_bc_flags}
      -MD -MF ${depfile}
      ${infile} -o ${outfile}
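For illustration, the compile step above boils down to an invocation like the following. This is a simplified sketch, not from the patch; the file names are placeholders and `${CLANG_TOOL}` is the in-tree clang, as before. The key pairing is `-fopenmp-targets=<triple>` with an empty `-Xopenmp-target=<triple> -march=`, which emits device bitcode with no concrete subtarget baked in:

```cmake
# Hypothetical, simplified stand-in for the custom command above.
add_custom_command(OUTPUT Mapping.cpp-nvptx.bc
  COMMAND ${CLANG_TOOL} -c -foffload-lto -std=c++17 -fvisibility=hidden
          --offload-device-only -fopenmp
          # The target is named only by its triple; the empty -march= leaves
          # the emitted bitcode free of target-cpu attributes.
          -fopenmp-targets=nvptx64-nvidia-cuda
          -Xopenmp-target=nvptx64-nvidia-cuda -march=
          Mapping.cpp -o Mapping.cpp-nvptx.bc
  DEPENDS Mapping.cpp)
```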
@@ -182,7 +147,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
     list(APPEND bc_files ${outfile})
   endforeach()
 
-  set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
+  set(bclib_name "libomptarget-${target_name}.bc")
 
   # Link to a bitcode library.
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}

@@ -222,7 +187,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()
 
-  set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
+  set(bclib_target_name "omptarget-${target_name}-bc")
   add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
 
   # Copy library to destination.

@@ -244,7 +209,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   # Package the bitcode in the bitcode and embed it in an ELF for the static library
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
     COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-      "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
+      "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp"
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
     COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
   )
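The `arch=generic` field is the packaging-side half of the change: the offload image is no longer stamped with one concrete GPU, so it is treated as compatible with any device of the given triple. A standalone sketch of the invocation, with hypothetical paths; the `--image=` syntax is copied from the command above, minus the optional `${target_feature}` entry:

```cmake
# Hypothetical standalone equivalent of the packaging command above, with the
# variables spelled out. Only 'arch=' differs from the old behavior.
execute_process(COMMAND clang-offload-packager
  -o packaged_libomptarget-nvptx.bc
  "--image=file=libomptarget-nvptx.bc,triple=nvptx64-nvidia-cuda,arch=generic,kind=openmp")
```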
@@ -254,14 +219,14 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()
 
-  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
+  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o")
   add_custom_command(OUTPUT ${output_name}
     COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
       -Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
       -o ${output_name}
       ${source_directory}/Stub.cpp
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp
-    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
+    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o"
     VERBATIM
   )
   if(TARGET clang)

@@ -274,11 +239,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})
 
   if (CMAKE_EXPORT_COMPILE_COMMANDS)
-    set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
+    set(ide_target_name omptarget-ide-${target_name})
     add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
     target_compile_options(${ide_target_name} PRIVATE
-      -fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
-      -mllvm -openmp-opt-disable
+      -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
+      -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
       -foffload-lto -fvisibility=hidden --offload-device-only
       -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
     )

@@ -293,18 +258,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   endif()
 endfunction()
 
-# Generate a Bitcode library for all the gpu architectures the user requested.
-add_custom_target(omptarget.devicertl.nvptx)
 add_custom_target(omptarget.devicertl.amdgpu)
-foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
-  if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
-  elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
-  else()
-    message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
-  endif()
-endforeach()
+compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
+
+add_custom_target(omptarget.devicertl.nvptx)
+compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
 
 # Archive all the object files generated above into a static library
 add_library(omptarget.devicertl STATIC)
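Taken together, the build now produces exactly one AMDGPU and one NVPTX DeviceRTL instead of one per `sm_XX`/`gfxXXX` architecture. How applications select hardware is unchanged; the following hypothetical consumer (not from the repository) still names a concrete architecture, and the generic runtime bitcode is specialized during LTO at application link time:

```cmake
# Hypothetical application build: the user still names a real architecture;
# only the DeviceRTL itself no longer needs to be rebuilt per GPU.
add_executable(my_offload_app main.cpp)
target_compile_options(my_offload_app PRIVATE -fopenmp --offload-arch=gfx90a)
target_link_options(my_offload_app PRIVATE -fopenmp --offload-arch=gfx90a)
```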
offload/DeviceRTL/src/Reduction.cpp

@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;

@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   } while (logical_lane_id % 2 == 0 && size > 1);
   return (logical_lane_id == 0);
 }
-#endif
 
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
-  /*
-   * This reduce function handles reduction within a team. It handles
-   * parallel regions in both L1 and L2 parallelism levels. It also
-   * supports Generic, SPMD, and NoOMP modes.
-   *
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-  }
-  return BlockThreadId == 0;
-#else
+  //
+  // This reduce function handles reduction within a team. It handles
+  // parallel regions in both L1 and L2 parallelism levels. It also
+  // supports Generic, SPMD, and NoOMP modes.
+  //
+  // 1. Reduce within a warp.
+  // 2. Warp master copies value to warp 0 via shared memory.
+  // 3. Warp 0 reduces to a single value.
+  // 4. The reduced value is available in the thread that returns 1.
+  //
+
+#if __has_builtin(__nvvm_reflect)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
Contributor: I'll try to make an AMDGPU counterpart for this one, though it doesn't look necessary for the purposes of the OpenMP device runtime.

Contributor (Author): I believe @AlexVlx also has interest in that space, so it's best to consult with him as well.
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpId = mapping::getWarpIdInBlock();
+
+    // Volta execution model:
+    // For the Generic execution mode a parallel region either has 1 thread and
+    // beyond that, always a multiple of 32. For the SPMD execution mode we may
+    // have any number of threads.
+    if ((NumThreads % mapping::getWarpSize() == 0) ||
+        (WarpId < WarpsNeeded - 1))
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+      gpu_irregular_warp_reduce(
+          reduce_data, shflFct,
+          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+    // When we have more than [mapping::getWarpSize()] number of threads
+    // a block reduction is performed here.
+    //
+    // Only L1 parallel region can enter this if condition.
+    if (NumThreads > mapping::getWarpSize()) {
+      // Gather all the reduced values from each warp
+      // to the first warp.
+      cpyFct(reduce_data, WarpsNeeded);
+
+      if (WarpId == 0)
+        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                  BlockThreadId);
+    }
+    return BlockThreadId == 0;
+  }
+#endif
   __kmpc_impl_lanemask_t Liveness = mapping::activemask();
   if (Liveness == lanes::All) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
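The `__nvvm_reflect("__CUDA_ARCH")` query is what lets this function ship as generic IR: the call survives into the bitcode as-is, and the NVPTX backend's NVVMReflect pass folds it to a constant once a real `sm_XX` target is chosen at application link time, deleting the dead branch. That recovers the old `#if __CUDA_ARCH__ >= 700` split without compiling the runtime once per architecture. A minimal sketch of the pattern (the helper name is hypothetical, not from this patch):

```cpp
#if __has_builtin(__nvvm_reflect)
// Hypothetical helper illustrating the guard above. The __nvvm_reflect call
// remains opaque in the generic bitcode and becomes a compile-time constant
// when the backend knows the real target.
static inline bool isVoltaOrNewer() {
  return __nvvm_reflect("__CUDA_ARCH") >= 700; // 700 corresponds to sm_70
}
#endif
```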
@@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
     return BlockThreadId == 0;
   }
 
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
+  // Get the OMP thread Id. This is different from BlockThreadId in the case
+  // of an L2 parallel region.
   return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
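To make step 1 of the four-step comment above concrete, here is a rough, hypothetical sketch of a warp-level sum. It assumes a shuffle-down helper of the shape `utils::shuffleDown(mask, value, delta, width)`; the real code instead routes every exchange through the caller-supplied `shflFct` and `cpyFct` callbacks so that arbitrary reduction payloads work:

```cpp
// Hypothetical illustration only, not part of this patch. Tree reduction
// within one warp: each round, lane i accumulates the value of lane i+Delta,
// halving the number of live partial sums until lane 0 holds the total.
static int32_t warpSumSketch(int32_t Value) {
  for (uint32_t Delta = mapping::getWarpSize() / 2; Delta > 0; Delta /= 2)
    Value += utils::shuffleDown(lanes::All, Value, Delta,
                                mapping::getWarpSize());
  return Value; // lane 0 now holds the sum across the warp
}
```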
On the empty `-Xopenmp-target=${target_triple} -march=` flag:

Contributor: ...and nothing after -march=?

Contributor (Author): Intentional.

Contributor: So that means there is no arch?