@@ -2270,7 +2270,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");

-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();

   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2452,6 +2452,211 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
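+    // Source operands begin immediately after the explicit defs, so src0
+    // sits at index NumDefs.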
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
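+    // If there is no carry-out, or it is dead, the add may fold to a mov or
+    // copy below.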
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
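+    // MaterializedReg tracks whichever register currently holds the frame
+    // base value.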
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
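+        // Strip the remaining operands so the operand list matches
+        // V_MOV_B32_e32.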
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32: {
     // TODO: Handle s_or_b32, s_and_b32.
     unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
@@ -2492,9 +2697,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       Offset = 0;

       if (MaterializedReg)
-        FIOp.ChangeToRegister(MaterializedReg, false);
+        FIOp->ChangeToRegister(MaterializedReg, false);
       else
-        FIOp.ChangeToImmediate(0);
+        FIOp->ChangeToImmediate(0);
     } else if (MaterializedReg) {
       // If we can't fold the other operand, do another increment.
       Register DstReg = DstOp.getReg();
@@ -2517,27 +2722,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       OtherOp.ChangeToRegister(MaterializedReg, false);
       OtherOp.setIsKill(true);
       OtherOp.setIsRenamable(true);
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     } else {
       // If we don't have any other offset to apply, we can just directly
       // interpret the frame index as the offset.
-      FIOp.ChangeToImmediate(Offset);
+      FIOp->ChangeToImmediate(Offset);
     }

     if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(OtherOpIdx);
-      MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
-    } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
       assert(Offset == 0);
       MI->removeOperand(3);
       MI->removeOperand(FIOperandNum);
       MI->setDesc(
           TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
     }

-    assert(!FIOp.isFI());
+    assert(!FIOp->isFI());
     return true;
   }
   default: {
@@ -2553,7 +2758,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,

     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);

     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2606,18 +2811,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }

     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
         return false;
     }

     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);

     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }

@@ -2626,8 +2831,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,

     Register TmpReg =
         RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();

     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2656,8 +2861,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     if (!TmpSReg) {
       // Use frame register and restore it after.
       TmpSReg = FrameReg;
-      FIOp.setReg(FrameReg);
-      FIOp.setIsKill(false);
+      FIOp->setReg(FrameReg);
+      FIOp->setIsKill(false);
     }

     if (NeedSaveSCC) {
@@ -2905,7 +3110,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       MI->eraseFromParent();
       return true;
     }
-    FIOp.ChangeToRegister(ResultReg, false, false, true);
+    FIOp->ChangeToRegister(ResultReg, false, false, true);
     return false;
   }

29113116
@@ -2936,13 +3141,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     // If the offset is simply too big, don't convert to a scratch wave offset
     // relative index.

-    FIOp.ChangeToImmediate(Offset);
-    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+    FIOp->ChangeToImmediate(Offset);
+    if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
       Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                       MI, false, 0);
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
           .addImm(Offset);
-      FIOp.ChangeToRegister(TmpReg, false, false, true);
+      FIOp->ChangeToRegister(TmpReg, false, false, true);
     }
   }
 }