diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66..e4d6e4b2f5459 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask :
 def int_amdgcn_s_wqm :
   DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
 
-class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
     [data_ty],
     [
       LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR)
@@ -2119,8 +2119,13 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceGenerator<list<string> Operations>{
+  foreach Opcode = Operations in
+    def Opcode : AMDGPUWaveReduce;
+}
+
+defvar Operations = ["umin", "min", "fmin", "umax", "max", "fmax", "add", "fadd", "sub", "fsub", "and", "or", "xor"];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;
 
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index f408a013d7a37..76c1feb0d5fe0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -751,53 +751,52 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
-  if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP) {
-      // First we need to set all inactive invocations to the identity value, so
-      // that they can correctly contribute to the final result.
-      NewV =
-          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-      if (!NeedResult && ST->hasPermLaneX16()) {
-        // On GFX10 the permlanex16 instruction helps us build a reduction
-        // without too many readlanes and writelanes, which are generally bad
-        // for performance.
-        NewV = buildReduction(B, ScanOp, NewV, Identity);
-      } else {
-        NewV = buildScan(B, ScanOp, NewV, Identity);
-        if (NeedResult)
-          ExclScan = buildShiftRight(B, NewV, Identity);
-        // Read the value from the last lane, which has accumulated the values
-        // of each active lane in the wavefront. This will be our new value
-        // which we will provide to the atomic operation.
-        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
-                                 {NewV, LastLaneIdx});
-      }
-      // Finally mark the readlanes in the WWM section.
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
-    } else if (ScanImpl == ScanOptions::Iterative) {
-      // Alternative implementation for scan
-      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
-      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
-      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
-                                                      ComputeLoop, ComputeEnd);
-    } else {
-      llvm_unreachable("Atomic Optimzer is disabled for None strategy");
-    }
-  } else {
+  // if (ValDivergent) {
+  //   if (ScanImpl == ScanOptions::DPP) {
+  //     // First we need to set all inactive invocations to the identity value, so
+  //     // that they can correctly contribute to the final result.
+ // NewV = + // B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + // if (!NeedResult && ST->hasPermLaneX16()) { + // // On GFX10 the permlanex16 instruction helps us build a reduction + // // without too many readlanes and writelanes, which are generally bad + // // for performance. + // NewV = buildReduction(B, ScanOp, NewV, Identity); + // } else { + // NewV = buildScan(B, ScanOp, NewV, Identity); + // if (NeedResult) + // ExclScan = buildShiftRight(B, NewV, Identity); + // // Read the value from the last lane, which has accumulated the values + // // of each active lane in the wavefront. This will be our new value + // // which we will provide to the atomic operation. + // Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + // NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, + // {NewV, LastLaneIdx}); + // } + // // Finally mark the readlanes in the WWM section. + // NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); + // } else if (ScanImpl == ScanOptions::Iterative) { + // // Alternative implementation for scan + // ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F); + // ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F); + // std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I, + // ComputeLoop, ComputeEnd); + // } else { + // llvm_unreachable("Atomic Optimzer is disabled for None strategy"); + // } + // } else { + // **************************************** Implement from here switch (Op) { + // TODO --implement for floats default: llvm_unreachable("Unhandled atomic op"); case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: { - // The new value we will be contributing to the atomic operation is the - // old value times the number of active lanes. - Value *const Ctpop = B.CreateIntCast( - B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); - NewV = buildMul(B, V, Ctpop); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_add, Int32Ty, {V, B.getInt32(0)}); + break; + case AtomicRMWInst::Sub: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_sub, Int32Ty, {V, B.getInt32(0)}); break; - } case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: { Value *const Ctpop = B.CreateIntCast( @@ -807,28 +806,39 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, break; } case AtomicRMWInst::And: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_and, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::Or: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_or, Int32Ty, {V, B.getInt32(0)}); + break; + case AtomicRMWInst::Xor: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_xor, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::Max: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_max, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::Min: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_min, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::UMax: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::UMin: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::FMin: case AtomicRMWInst::FMax: // These operations with a uniform value are idempotent: doing the atomic // operation multiple times has the same effect as doing it once. 
- NewV = V; + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)}); break; - case AtomicRMWInst::Xor: - // The new value we will be contributing to the atomic operation is the - // old value times the parity of the number of active lanes. - Value *const Ctpop = B.CreateIntCast( - B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); - NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1)); - break; } - } + + // **************************************** Implement to here + + // NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)}); // We only want a single lane to enter our new control flow, and we do this // by checking if there are any active lanes below us. Only one lane will // have 0 active lanes below us, so that will be the only one to progress. @@ -854,39 +864,40 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // ComputeEnd block. We also need to set up predecessor to next block when // single lane done updating the final reduced value. BasicBlock *Predecessor = nullptr; - if (ValDivergent && ScanImpl == ScanOptions::Iterative) { - // Move terminator from I's block to ComputeEnd block. - // - // OriginalBB is known to have a branch as terminator because - // SplitBlockAndInsertIfThen will have inserted one. - BranchInst *Terminator = cast(OriginalBB->getTerminator()); - B.SetInsertPoint(ComputeEnd); - Terminator->removeFromParent(); - B.Insert(Terminator); - - // Branch to ComputeLoop Block unconditionally from the I's block for - // iterative approach. - B.SetInsertPoint(OriginalBB); - B.CreateBr(ComputeLoop); - - // Update the dominator tree for new control flow. - SmallVector DomTreeUpdates( - {{DominatorTree::Insert, OriginalBB, ComputeLoop}, - {DominatorTree::Insert, ComputeLoop, ComputeEnd}}); - - // We're moving the terminator from EntryBB to ComputeEnd, make sure we move - // the DT edges as well. - for (auto *Succ : Terminator->successors()) { - DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ}); - DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ}); - } - - DTU.applyUpdates(DomTreeUpdates); - - Predecessor = ComputeEnd; - } else { - Predecessor = OriginalBB; - } + // if (ValDivergent && ScanImpl == ScanOptions::Iterative) { + // // Move terminator from I's block to ComputeEnd block. + // // + // // OriginalBB is known to have a branch as terminator because + // // SplitBlockAndInsertIfThen will have inserted one. + // BranchInst *Terminator = cast(OriginalBB->getTerminator()); + // B.SetInsertPoint(ComputeEnd); + // Terminator->removeFromParent(); + // B.Insert(Terminator); + + // // Branch to ComputeLoop Block unconditionally from the I's block for + // // iterative approach. + // B.SetInsertPoint(OriginalBB); + // B.CreateBr(ComputeLoop); + + // // Update the dominator tree for new control flow. + // SmallVector DomTreeUpdates( + // {{DominatorTree::Insert, OriginalBB, ComputeLoop}, + // {DominatorTree::Insert, ComputeLoop, ComputeEnd}}); + + // // We're moving the terminator from EntryBB to ComputeEnd, make sure we move + // // the DT edges as well. + // for (auto *Succ : Terminator->successors()) { + // DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ}); + // DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ}); + // } + + // DTU.applyUpdates(DomTreeUpdates); + + // Predecessor = ComputeEnd; + // } else { + // Predecessor = OriginalBB; + // } + Predecessor = OriginalBB; // Move the IR builder into single_lane next. 
B.SetInsertPoint(SingleLaneTerminator); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bc771d4ef6c08..24c6dc0afbce5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4846,8 +4846,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } - case Intrinsic::amdgcn_wave_reduce_umin: - case Intrinsic::amdgcn_wave_reduce_umax: { + case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_fadd: + case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_fsub: + case Intrinsic::amdgcn_wave_reduce_min: + case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_fmin: + case Intrinsic::amdgcn_wave_reduce_max: + case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_fmax: + case Intrinsic::amdgcn_wave_reduce_and: + case Intrinsic::amdgcn_wave_reduce_or: + case Intrinsic::amdgcn_wave_reduce_xor: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 885ecab891b1f..4ffcee15225cd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4861,10 +4861,141 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register DstReg = MI.getOperand(0).getReg(); MachineBasicBlock *RetBB = nullptr; if (isSGPR) { - // These operations with a uniform value i.e. SGPR are idempotent. - // Reduced value will be same as given sgpr. - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); - RetBB = &BB; + switch(Opc){ + case AMDGPU::S_MIN_U32: + case AMDGPU::S_MIN_I32: + case AMDGPU::S_MAX_U32: + case AMDGPU::S_MAX_I32: + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32:{ + // These operations with a uniform value i.e. SGPR are idempotent. + // Reduced value will be same as given sgpr. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); + RetBB = &BB; + break; + } + case AMDGPU::S_XOR_B32: + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32:{ + // MachineBasicBlock::iterator I = BB.end(); + // Register SrcReg = MI.getOperand(1).getReg(); + + // // Create Control flow for loop + // // Split MI's Machine Basic block into For loop + // auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); + + // // Create virtual registers required for lowering. + const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); + const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); + Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); + // Register InitalValReg = MRI.createVirtualRegister(DstRegClass); + + // Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); + // Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + // Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + + // Register FF1Reg = MRI.createVirtualRegister(DstRegClass); + Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass); + + bool IsWave32 = ST.isWave32(); + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned ExecReg = IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      // Create initial values of induction variable from Exec, Accumulator and
+      // insert branch instr to newly created ComputeBlock
+      // uint32_t InitalValue = 0;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
+              .addReg(Exec->getOperand(0).getReg());
+
+      // BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+      //     .addImm(InitalValue);
+      // BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+      // // Start constructing ComputeLoop
+      // I = ComputeLoop->end();
+      // auto Accumulator =
+      //     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+      //         .addReg(InitalValReg)
+      //         .addMBB(&BB);
+      // auto ActiveBits =
+      //     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+      //         .addReg(TmpSReg->getOperand(0).getReg())
+      //         .addMBB(&BB);
+
+      // // Perform the computations
+      // unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+      // auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+      //                .addReg(ActiveBits->getOperand(0).getReg());
+
+      // // Manipulate the iterator to get the next active lane
+      // unsigned BITSETOpc =
+      //     IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+      // auto NewActiveBits =
+      //     BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+      //         .addReg(FF1->getOperand(0).getReg())
+      //         .addReg(ActiveBits->getOperand(0).getReg());
+
+      // // Add phi nodes
+      // Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+      //     .addMBB(ComputeLoop);
+      // ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+      //     .addMBB(ComputeLoop);
+
+      // // Creating branching
+      // unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+      // BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+      //     .addReg(NewActiveBits->getOperand(0).getReg())
+      //     .addImm(0);
+      // BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+      //     .addMBB(ComputeLoop);
+
+      // I = ComputeEnd->begin();
+      switch(Opc){
+        case AMDGPU::S_XOR_B32:{
+          // Performing an XOR operation on a uniform value
+          // depends on the number of active lanes. If there
+          // are an even number of active lanes, then the XOR
+          // will result in 0. And if there are an odd number
+          // of active lanes then the XOR will result in the
+          // same value as that in the SGPR. This comes from
+          // the fact that A^A = 0 and A^0 = A.
+
+          Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+          auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+              .addReg(NewAccumulator->getOperand(0).getReg())
+              .addImm(1);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityReg->getOperand(0).getReg());
+          break;
+        }
+        case AMDGPU::S_SUB_I32:{
+          // TODO --> use 2's complement or subtract from 0 to find the negation of the number.
+          Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+          // Take the negation of the source operand.
+          auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(InvertedValReg->getOperand(0).getReg())
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          break;
+        }
+        case AMDGPU::S_ADD_I32:{
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          break;
+        }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4900,9 +5031,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // insert branch instr to newly created ComputeBlock
+    uint32_t InitalValue;
+    switch(Opc){
+      case AMDGPU::S_MIN_U32:
+        InitalValue = std::numeric_limits<uint32_t>::max();
+        break;
+      case AMDGPU::S_MIN_I32:
+        InitalValue = std::numeric_limits<int32_t>::max();
+        break;
+      case AMDGPU::S_MAX_U32:
+        InitalValue = 0;
+        break;
+      case AMDGPU::S_MAX_I32:
+        InitalValue = std::numeric_limits<int32_t>::min();
+        break;
+      case AMDGPU::S_ADD_I32:
+      case AMDGPU::S_SUB_I32:
+      case AMDGPU::S_OR_B32:
+      case AMDGPU::S_XOR_B32:
+        InitalValue = 0x00000000;
+        break;
+      case AMDGPU::S_AND_B32:
+        InitalValue = 0xFFFFFFFF;
+    }
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
         .addImm(InitalValue);
@@ -4968,10 +5120,44 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
   switch (MI.getOpcode()) {
-  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
-  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_F32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_F32);
+  // case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_F32);
+  // case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_F32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
+  //   return lowerWaveReduce(MI, *BB, 
*getSubtarget(), AMDGPU::S_AND_B32); + // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32: + // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); + case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32: + // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32: + // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { const DebugLoc &DL = MI.getDebugLoc(); @@ -6859,7 +7045,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64 + // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64 // TODO --> `..are zero-extended to 32-bits, then we ..` , should this be zero-extended from 32 bits? // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo. KnownBits Op0KnownBits = DAG.computeKnownBits(Op0); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9afb29d95abd7..c5883ff783903 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -255,15 +255,100 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>; let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { - def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> { } - def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_MIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_fmin f32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> { } + + def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_MAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_fmax f32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_ADD_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_add f32:$src, 
i32:$strategy))]> { + } + + //def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> { + } + + //def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_SUB_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_fsub f32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> { + } + + //def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> { + //} + + //def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> { + } + + //def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> { + //} + + //def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> { + } } let usesCustomInserter = 1, Defs = [VCC] in { diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll index d1e50bd560cb2..02942254cc555 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -156,7 +156,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) # ; IR-DPP: 14: ; IR-DPP-NEXT: ret void ; - %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst ret void }
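
Note on the uniform (SGPR) lowering above: when the input is the same in every active lane, the reduction has a closed form, which is what the SGPR path of lowerWaveReduce emits with S_BCNT1 and S_MUL_I32. The sketch below is illustrative only and is not part of the patch; `WaveOp` and `reduceUniform` are made-up names used to model those identities in plain C++ and cross-check them against a per-lane accumulation (the shape of the iterative lowering).

// Standalone model of the uniform-input identities, assuming a value V that is
// identical in all N active lanes:
//   min/max/and/or -> V (idempotent), add -> V*N, sub -> -(V*N), xor -> V*(N&1).
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class WaveOp { Min, Max, And, Or, Add, Sub, Xor };

// Hypothetical helper, named here only for illustration.
int32_t reduceUniform(WaveOp Op, int32_t V, uint32_t NumActiveLanes) {
  switch (Op) {
  case WaveOp::Min:
  case WaveOp::Max:
  case WaveOp::And:
  case WaveOp::Or:
    return V;                                             // idempotent ops
  case WaveOp::Add:
    return V * static_cast<int32_t>(NumActiveLanes);      // value times lane count
  case WaveOp::Sub:
    return -V * static_cast<int32_t>(NumActiveLanes);     // negated value times lane count
  case WaveOp::Xor:
    return V * static_cast<int32_t>(NumActiveLanes & 1);  // parity of lane count
  }
  return V;
}

int main() {
  // Cross-check against a per-lane accumulation, i.e. what an iterative
  // (divergent-input) lowering computes one active lane at a time.
  const int32_t V = 7;
  for (uint32_t N = 1; N <= 64; ++N) {
    int32_t Add = 0, Sub = 0, Xor = 0;
    for (uint32_t Lane = 0; Lane < N; ++Lane) {
      Add += V;
      Sub -= V;
      Xor ^= V;
    }
    assert(reduceUniform(WaveOp::Add, V, N) == Add);
    assert(reduceUniform(WaveOp::Sub, V, N) == Sub);
    assert(reduceUniform(WaveOp::Xor, V, N) == Xor);
    assert(reduceUniform(WaveOp::And, V, N) == V);
  }
  std::puts("uniform wave-reduce identities hold for 1..64 active lanes");
  return 0;
}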