@@ -2216,7 +2216,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2445,7 +2445,317 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
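+      // For instance, $vgpr0 = V_ADD_U32_e32 $vgpr0, %stack.0 reads its own
+      // result register, so it cannot simply be updated in place.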
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
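+      // The frame register holds the wave-scaled ("swizzled") byte offset;
+      // shifting it right by log2(wavesize) recovers the unswizzled offset
+      // that the VALU add combines with the frame index's offset.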
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
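+    // e.g. an add of 4 to an object at frame offset 16 simply becomes an
+    // add of 20, with no separate offset left to materialize.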
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
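+      // (VOP2 encodings accept an SGPR or literal only in src0; src1 must
+      // be a VGPR, hence the operand orders chosen above.)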
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
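+        // e.g. $vgpr1 = V_ADD_U32_e32 $vgpr1, %stack.0 with the frame index
+        // folded to 0 is an identity copy and can be erased outright.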
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
 
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
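+    // In kernels the stack lives at offset 0 from the scratch wave base,
+    // so a frame index here frequently resolves to a literal 0.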
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
+  case AMDGPU::S_ADD_I32: {
+    // TODO: Handle s_or_b32, s_and_b32.
+    unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
+    MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
+
+    assert(FrameReg || MFI->isBottomOfStack());
+
+    MachineOperand &DstOp = MI->getOperand(0);
+    const DebugLoc &DL = MI->getDebugLoc();
+    Register MaterializedReg = FrameReg;
+
+    // Defend against live scc, which should never happen in practice.
+    bool DeadSCC = MI->getOperand(3).isDead();
+
+    Register TmpReg;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // FIXME: In the common case where the add does not also read its result
+      // (i.e. this isn't a reg += fi), it's not finding the dest reg as
+      // available.
+      TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
+                                             false, 0);
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
+          .addDef(TmpReg, RegState::Renamable)
+          .addReg(FrameReg)
+          .addImm(ST.getWavefrontSizeLog2())
+          .setOperandDead(3); // Set SCC dead
+      MaterializedReg = TmpReg;
+    }
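+    // As in the VALU path above, this converts the wave-scaled frame
+    // pointer to an unswizzled offset, here staying entirely in SGPRs.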
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp.isImm()) {
+      OtherOp.setImm(OtherOp.getImm() + Offset);
+      Offset = 0;
+
+      if (MaterializedReg)
+        FIOp->ChangeToRegister(MaterializedReg, false);
+      else
+        FIOp->ChangeToImmediate(0);
+    } else if (MaterializedReg) {
+      // If we can't fold the other operand, do another increment.
+      Register DstReg = DstOp.getReg();
+
+      if (!TmpReg && MaterializedReg == FrameReg) {
+        TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                               MI, false, 0);
+        DstReg = TmpReg;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
+                        .addDef(DstReg, RegState::Renamable)
+                        .addReg(MaterializedReg, RegState::Kill)
+                        .add(OtherOp);
+      if (DeadSCC)
+        AddI32.setOperandDead(3);
+
+      MaterializedReg = DstReg;
+
+      OtherOp.ChangeToRegister(MaterializedReg, false);
+      OtherOp.setIsKill(true);
+      OtherOp.setIsRenamable(true);
+      FIOp->ChangeToImmediate(Offset);
+    } else {
+      // If we don't have any other offset to apply, we can just directly
+      // interpret the frame index as the offset.
+      FIOp->ChangeToImmediate(Offset);
+    }
+
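+    // With everything folded, degenerate adds simplify further: when SCC is
+    // dead, an add of 0 becomes a COPY of the remaining register operand (or
+    // an S_MOV_B32 if what remains is an immediate).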
+    if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(OtherOpIdx);
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(FIOperandNum);
+      MI->setDesc(
+          TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    }
+
+    assert(!FIOp->isFI());
+    return true;
+  }
   default: {
     // Other access to frame index
     const DebugLoc &DL = MI->getDebugLoc();
@@ -2459,7 +2769,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2512,18 +2822,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }
 
     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
        return false;
     }
 
     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }
 
@@ -2532,8 +2842,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     Register TmpReg =
         RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2562,8 +2872,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     if (!TmpSReg) {
       // Use frame register and restore it after.
       TmpSReg = FrameReg;
-      FIOp.setReg(FrameReg);
-      FIOp.setIsKill(false);
+      FIOp->setReg(FrameReg);
+      FIOp->setIsKill(false);
     }
 
     if (NeedSaveSCC) {
@@ -2802,7 +3112,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       MI->eraseFromParent();
       return true;
     }
-    FIOp.ChangeToRegister(ResultReg, false, false, true);
+    FIOp->ChangeToRegister(ResultReg, false, false, true);
     return false;
   }
 
@@ -2833,13 +3143,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     // If the offset is simply too big, don't convert to a scratch wave offset
     // relative index.
 
-    FIOp.ChangeToImmediate(Offset);
-    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+    FIOp->ChangeToImmediate(Offset);
+    if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
       Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                       MI, false, 0);
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
           .addImm(Offset);
-      FIOp.ChangeToRegister(TmpReg, false, false, true);
+      FIOp->ChangeToRegister(TmpReg, false, false, true);
     }
   }
 }