diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0f2c33585884f..ce2b4a5f6f2e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,6 +562,11 @@ class AMDGPURewriteAGPRCopyMFMAPass void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +struct AMDGPUUniformIntrinsicCombinePass + : public PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 9449e70930913..a6074eaf78fd0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 280fbe20667c6..2b37c6fa072e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -526,6 +526,11 @@ static cl::opt HasClosedWorldAssumption( cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden); +static cl::opt EnableUniformIntrinsicCombine( + "amdgpu-enable-uniform-intrinsic-combine", + cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheR600Target()); @@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); + + if (EnableUniformIntrinsicCombine) + PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp new file mode 100644 index 0000000000000..50c78d8c67251 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -0,0 +1,159 @@ +//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass simplifies certain intrinsic calls when the arguments are uniform. +/// It's true that this pass has transforms that can lead to a situation where +/// some instruction whose operand was previously recognized as statically +/// uniform is later on no longer recognized as statically uniform. However, the +/// semantics of how programs execute don't (and must not, for this precise +/// reason) care about static uniformity, they only ever care about dynamic +/// uniformity. And every instruction that's downstream and cares about dynamic +/// uniformity must be convergent (and isel will introduce v_readfirstlane for +/// them if their operands can't be proven statically uniform). +/// +/// This pass is implemented as a ModulePass because intrinsic declarations +/// exist at the module scope, allowing us to skip processing entirely if no +/// declarations are present and to traverse their user lists directly when +/// they are. A FunctionPass would instead require scanning every instruction +/// in every function to find relevant intrinsics, which is far less efficient. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine" + +using namespace llvm; +using namespace llvm::AMDGPU; +using namespace llvm::PatternMatch; + +/// Wrapper for querying uniformity info that first checks locally tracked +/// instructions. +static bool +isDivergentUseWithNew(const Use &U, const UniformityInfo &UI, + const ValueMap &Tracker) { + Value *V = U.get(); + if (auto It = Tracker.find(V); It != Tracker.end()) + return !It->second; // divergent if marked false + return UI.isDivergentUse(U); +} + +/// Optimizes uniform intrinsics calls if their operand can be proven uniform. +static bool optimizeUniformIntrinsic(IntrinsicInst &II, + const UniformityInfo &UI, + ValueMap &Tracker) { + llvm::Intrinsic::ID IID = II.getIntrinsicID(); + + switch (IID) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n'); + II.replaceAllUsesWith(Src); + II.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_ballot: { + Value *Src = II.getArgOperand(0); + if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) + return false; + LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n'); + + bool Changed = false; + for (User *U : make_early_inc_range(II.users())) { + if (auto *ICmp = dyn_cast(U)) { + Value *Op0 = ICmp->getOperand(0); + Value *Op1 = ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + Value *OtherOp = Op0 == &II ? Op1 : Op0; + + if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) { + // Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1 + Instruction *NotOp = + BinaryOperator::CreateNot(Src, "", ICmp->getIterator()); + Tracker[NotOp] = true; // NOT preserves uniformity + LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); + ICmp->replaceAllUsesWith(NotOp); + ICmp->eraseFromParent(); + Changed = true; + } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { + // Case: (icmp ne %ballot, 0) -> %ballot_arg + LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " + << *Src << '\n'); + ICmp->replaceAllUsesWith(Src); + ICmp->eraseFromParent(); + Changed = true; + } + } + } + // Erase the intrinsic if it has no remaining uses. + if (II.use_empty()) + II.eraseFromParent(); + return Changed; + } + default: + llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + } + return false; +} + +/// Iterates over intrinsic declarations in the module to optimize their uses. +static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { + bool IsChanged = false; + ValueMap Tracker; + + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + for (Function &F : M) { + switch (F.getIntrinsicID()) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_ballot: + break; + default: + continue; + } + + for (User *U : make_early_inc_range(F.users())) { + auto *II = cast(U); + Function *ParentF = II->getFunction(); + const auto &UI = FAM.getResult(*ParentF); + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); + } + } + return IsChanged; +} + +PreservedAnalyses +AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { + if (!runUniformIntrinsicCombine(M, AM)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index aae56eef73edd..13f727b68c0d9 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp + AMDGPUUniformIntrinsicCombine.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUISelDAGToDAG.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll new file mode 100644 index 0000000000000..6c4f504f3456c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -0,0 +1,452 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O3 -S < %s | FileCheck %s -check-prefix=O3-CHECK + +define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]] +; CURRENT-CHECK: [[IF_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]] +; CURRENT-CHECK: [[IF_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 0, %ballot ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]] +; CURRENT-CHECK: [[WHILE]]: +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done) + %is_done = icmp ne i64 0, %ballot ; in this case is_done = done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]] +; CURRENT-CHECK: [[WHILE]]: +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %done) + %is_done = icmp ne i64 %ballot, 0 ; in this case is_done = done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[WORK_PEEL:.*]] +; CURRENT-CHECK: [[WORK_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 +; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] +; PASS-CHECK: [[WORK]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[TAIL]] +; PASS-CHECK: [[TAIL]]: +; PASS-CHECK-NEXT: [[NEW_DONE]] = phi i1 [ true, %[[WORK]] ], [ false, %[[IF]] ] +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_uniform_waterfall( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ false, %entry ], [ %new_done, %tail ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 + br i1 %is_done, label %exit, label %if + +if: + %first_active_id = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0) + %is_first_active_id = icmp eq i32 0, %first_active_id + br i1 %is_first_active_id, label %work, label %tail + +work: + store i32 5, ptr addrspace(1) %out + br label %tail + +tail: + %new_done = phi i1 [ true, %work ], [ false, %if ] + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i32 %mymask) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]], i32 [[MYMASK:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[WORK_PEEL:.*]] +; CURRENT-CHECK: [[WORK_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[MYMASK:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] +; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] +; PASS-CHECK: [[WORK]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[TAIL]] +; PASS-CHECK: [[TAIL]]: +; PASS-CHECK-NEXT: [[NEW_DONE]] = phi i1 [ true, %[[WORK]] ], [ false, %[[IF]] ] +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @uniform_waterfall( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[MYMASK:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ false, %entry ], [ %new_done, %tail ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 + br i1 %is_done, label %exit, label %if + +if: + %first_active_id = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 %mymask) + %is_first_active_id = icmp eq i32 %mymask, %first_active_id + br i1 %is_first_active_id, label %work, label %tail + +work: + store i32 5, ptr addrspace(1) %out + br label %tail + +tail: + %new_done = phi i1 [ true, %work ], [ false, %if ] + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: [[BALLOT_PEEL:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_PEEL:%.*]] = icmp eq i32 [[BALLOT_PEEL]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_PEEL]], label %[[EXIT:.*]], label %[[IF_PEEL:.*]] +; CURRENT-CHECK: [[IF_PEEL]]: +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[EXIT]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %not_done) + %is_done = icmp eq i32 %ballot, 0 ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[ENTRY:.*:]] +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: br label %[[WHILE:.*]] +; CURRENT-CHECK: [[WHILE]]: +; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CURRENT-CHECK-NEXT: [[IS_DONE_NOT:%.*]] = icmp eq i32 [[BALLOT]], 0 +; CURRENT-CHECK-NEXT: br i1 [[IS_DONE_NOT]], label %[[WHILE]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CURRENT-CHECK: [[EXIT]]: +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: br label %[[WHILE:.*]] +; PASS-CHECK: [[WHILE]]: +; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] +; PASS-CHECK: [[IF]]: +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: br label %[[WHILE]] +; PASS-CHECK: [[EXIT]]: +; PASS-CHECK-NEXT: ret void +; +; O3-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32( +; O3-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; O3-CHECK-NEXT: [[ENTRY:.*:]] +; O3-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; O3-CHECK-NEXT: ret void +; +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %ballot = tail call i32 @llvm.amdgcn.ballot.i32(i1 %done) + %is_done = icmp ne i32 0, %ballot ; in this case is_done = done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} + +declare i64 @llvm.amdgcn.ballot.i64(i1) #1 +!6 = !{i64 690} +!7 = distinct !{!7, !8} +!8 = !{!"llvm.loop.mustprogress"} +;. +; CURRENT-CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CURRENT-CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1} +; CURRENT-CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +; CURRENT-CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll new file mode 100644 index 0000000000000..aa11574517520 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -0,0 +1,790 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-enable-uniform-intrinsic-combine=0 -O3 -S < %s | FileCheck %s -check-prefix=CURRENT-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,dce -S < %s | FileCheck %s -check-prefix=DCE-CHECK + +define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CURRENT-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PASS-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_constant( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; DCE-CHECK-NEXT: store i32 77, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlane64(i32 77) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_uniform( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 [[SRC]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.permlane64(i32 %src) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_nonuniform_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid2 = add i32 %tid, 1 + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_constant( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_constant( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_constant( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_nonuniform_indices(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_indices( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_nonuniform_workitem(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_workitem( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readlane_nonuniform_expression(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[TIDX2:%.*]] = add nuw nsw i32 [[TIDX]], 1 +; CURRENT-CHECK-NEXT: [[TIDY2:%.*]] = add nuw nsw i32 [[TIDY]], 2 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TIDX]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1 +; PASS-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_nonuniform_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1 +; DCE-CHECK-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TIDX]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %tidx2 = add i32 %tidx, 1 + %tidy2 = add i32 %tidy, 2 + %v = call i32 @llvm.amdgcn.readlane(i32 %tidx2, i32 %tidy2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readfirstlane(i32 7) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i32 %src0) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]], i32 [[SRC0:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_with_workitem_id(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_workitem_id( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readfirstlane_expression(i32 addrspace(1)* %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TID2:%.*]] = add nuw nsw i32 [[TID]], 1 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]]) +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TID2]] to i64 +; CURRENT-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i64 [[TMP1]] +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]]) +; PASS-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]] +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID2]]) +; DCE-CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT]], i32 [[TID2]] +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid2 = add i32 %tid, 1 + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %tid2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]]) +; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]]) +; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_with_firstlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TIDX]]) +; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y() +; CURRENT-CHECK-NEXT: [[V1:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_readlane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2) + store i32 %v2, ptr addrspace(1) %out + ret void +} + + +define amdgpu_kernel void @permlane64_boundary(ptr addrspace(1) %out_min, ptr addrspace(1) %out_max) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT_MIN:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT_MAX:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4 +; CURRENT-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4 +; PASS-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @permlane64_boundary( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT_MIN:%.*]], ptr addrspace(1) [[OUT_MAX:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: store i32 -2147483648, ptr addrspace(1) [[OUT_MIN]], align 4 +; DCE-CHECK-NEXT: store i32 2147483647, ptr addrspace(1) [[OUT_MAX]], align 4 +; DCE-CHECK-NEXT: ret void +; + %min_v = call i32 @llvm.amdgcn.permlane64(i32 -2147483648) + store i32 %min_v, ptr addrspace(1) %out_min + %max_v = call i32 @llvm.amdgcn.permlane64(i32 2147483647) + store i32 %max_v, ptr addrspace(1) %out_max + ret void +} + +define amdgpu_kernel void @readlane_cross_lane(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[TIDX:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[TIDY:%.*]] = add nuw nsw i32 [[TIDX]], 5 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_cross_lane( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[TIDY:%.*]] = add i32 [[TIDX]], 5 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = add i32 %tidx, 5 + %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CURRENT-CHECK-NEXT: store i32 435, ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 +; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 +; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %random = xor i32 123, 456 + %v = call i32 @llvm.amdgcn.readfirstlane(i32 %random) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_expression(ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @readlane_expression( +; CURRENT-CHECK-SAME: ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[IDX1:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CURRENT-CHECK-NEXT: [[IDX2:%.*]] = shl nuw nsw i32 [[IDX1]], 1 +; CURRENT-CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]]) +; CURRENT-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @readlane_expression( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @readlane_expression( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[IDX1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; DCE-CHECK-NEXT: [[IDX2:%.*]] = mul i32 [[IDX1]], 2 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[IDX1]], i32 [[IDX2]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: ret void +; + %idx1 = call i32 @llvm.amdgcn.workitem.id.x() + %idx2 = mul i32 %idx1, 2 + %v = call i32 @llvm.amdgcn.readlane(i32 %idx1, i32 %idx2) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( +; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; CURRENT-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0 +; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( +; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( +; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; DCE-CHECK-NEXT: ret void +; + %c = trunc i32 %v to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) + %ballot_ne_zero = icmp ne i32 %ballot, 0 + store i1 %ballot_ne_zero, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( +; CURRENT-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 1)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CURRENT-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; CURRENT-CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; CURRENT-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[TMP1]], 0 +; CURRENT-CHECK-NEXT: store i1 [[BALLOT_NE_ZERO]], ptr addrspace(1) [[OUT]], align 1 +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( +; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( +; DCE-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; DCE-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 +; DCE-CHECK-NEXT: ret void +; + %c = trunc i32 %v to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) + %ballot_ne_zero = icmp ne i64 %ballot, 0 + store i1 %ballot_ne_zero, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_readlane_i16(i16 %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16( +; CURRENT-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16( +; PASS-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i16( +; DCE-CHECK-SAME: i16 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call i16 @llvm.amdgcn.readlane.i16(i16 %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(i16 %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_i64(i64 %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64( +; CURRENT-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64( +; PASS-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_i64( +; DCE-CHECK-SAME: i64 [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(i64 %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_bf16(bfloat %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16( +; CURRENT-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16( +; PASS-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_bf16( +; DCE-CHECK-SAME: bfloat [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(bfloat %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_f16(half %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16( +; CURRENT-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16( +; PASS-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f16( +; DCE-CHECK-SAME: half [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call half @llvm.amdgcn.readlane.f16(half %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(half %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_f32(float %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32( +; CURRENT-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32( +; PASS-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f32( +; DCE-CHECK-SAME: float [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call float @llvm.amdgcn.readlane.f32(float %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(float %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_f64(double %src0, i32 %src1) { +; CURRENT-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64( +; CURRENT-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64( +; PASS-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define amdgpu_kernel void @test_readlane_f64( +; DCE-CHECK-SAME: double [[SRC0:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(double %readlane) + ret void +} +; All such cases can be optimised, given generic way to query getDeclarationIfExists() +define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) { +; CURRENT-CHECK-LABEL: define void @test_readlane_v8i16( +; CURRENT-CHECK-SAME: ptr addrspace(1) readnone captures(none) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CURRENT-CHECK-NEXT: [[X:%.*]] = tail call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]]) +; CURRENT-CHECK-NEXT: tail call void asm sideeffect " +; CURRENT-CHECK-NEXT: ret void +; +; PASS-CHECK-LABEL: define void @test_readlane_v8i16( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; PASS-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]]) +; PASS-CHECK-NEXT: call void asm sideeffect " +; PASS-CHECK-NEXT: ret void +; +; DCE-CHECK-LABEL: define void @test_readlane_v8i16( +; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], <8 x i16> [[SRC:%.*]], i32 [[SRC1:%.*]]) #[[ATTR0]] { +; DCE-CHECK-NEXT: [[X:%.*]] = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> [[SRC]], i32 [[SRC1]]) +; DCE-CHECK-NEXT: call void asm sideeffect " +; DCE-CHECK-NEXT: ret void +; + %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<8 x i16> %x) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll new file mode 100644 index 0000000000000..2fde3e3759f47 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-temporal-divergence.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=COMB-CHECK + +; This should not be optimized +define amdgpu_cs void @temporal_divergence(ptr addrspace(1) %out, i32 %n) { +; PASS-CHECK-LABEL: define amdgpu_cs void @temporal_divergence( +; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; PASS-CHECK-NEXT: [[ENTRY:.*]]: +; PASS-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; PASS-CHECK-NEXT: br label %[[H:.*]] +; PASS-CHECK: [[H]]: +; PASS-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ] +; PASS-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1 +; PASS-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0 +; PASS-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]] +; PASS-CHECK: [[X]]: +; PASS-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]]) +; PASS-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5 +; PASS-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: ret void +; +; COMB-CHECK-LABEL: define amdgpu_cs void @temporal_divergence( +; COMB-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; COMB-CHECK-NEXT: [[ENTRY:.*]]: +; COMB-CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; COMB-CHECK-NEXT: br label %[[H:.*]] +; COMB-CHECK: [[H]]: +; COMB-CHECK-NEXT: [[UNI_MERGE_H:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[UNI_INC:%.*]], %[[H]] ] +; COMB-CHECK-NEXT: [[UNI_INC]] = add i32 [[UNI_MERGE_H]], 1 +; COMB-CHECK-NEXT: [[DIV_EXITX:%.*]] = icmp eq i32 [[TID]], 0 +; COMB-CHECK-NEXT: br i1 [[DIV_EXITX]], label %[[X:.*]], label %[[H]] +; COMB-CHECK: [[X]]: +; COMB-CHECK-NEXT: [[UNI_JOIN:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[UNI_INC]]) +; COMB-CHECK-NEXT: [[JOIN_USER:%.*]] = add i32 [[UNI_JOIN]], 5 +; COMB-CHECK-NEXT: store i32 [[JOIN_USER]], ptr addrspace(1) [[OUT]], align 4 +; COMB-CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + br label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ] + %uni.inc = add i32 %uni.merge.h, 1 + %div.exitx = icmp eq i32 %tid, 0 + br i1 %div.exitx, label %X, label %H ; divergent branch + +X: + %uni.join = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %uni.inc) + %join.user = add i32 %uni.join, 5 + store i32 %join.user, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.readfirstlane.i32(i32)