@@ -2250,7 +2250,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
22502250 assert (MF->getRegInfo ().isReserved (MFI->getScratchRSrcReg ()) &&
22512251 " unreserved scratch RSRC register" );
22522252
2253- MachineOperand & FIOp = MI->getOperand (FIOperandNum);
2253+ MachineOperand * FIOp = & MI->getOperand (FIOperandNum);
22542254 int Index = MI->getOperand (FIOperandNum).getIndex ();
22552255
22562256 Register FrameReg = FrameInfo.isFixedObjectIndex (Index) && hasBasePointer (*MF)
@@ -2432,6 +2432,211 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24322432 MI->eraseFromParent ();
24332433 return true ;
24342434 }
2435+ case AMDGPU::V_ADD_U32_e32:
2436+ case AMDGPU::V_ADD_U32_e64:
2437+ case AMDGPU::V_ADD_CO_U32_e32:
2438+ case AMDGPU::V_ADD_CO_U32_e64: {
2439+ // TODO: Handle sub, and, or.
2440+ unsigned NumDefs = MI->getNumExplicitDefs ();
2441+ unsigned Src0Idx = NumDefs;
2442+
2443+ bool HasClamp = false ;
2444+ MachineOperand *VCCOp = nullptr ;
2445+
2446+ switch (MI->getOpcode ()) {
2447+ case AMDGPU::V_ADD_U32_e32:
2448+ break ;
2449+ case AMDGPU::V_ADD_U32_e64:
2450+ HasClamp = MI->getOperand (3 ).getImm ();
2451+ break ;
2452+ case AMDGPU::V_ADD_CO_U32_e32:
2453+ VCCOp = &MI->getOperand (3 );
2454+ break ;
2455+ case AMDGPU::V_ADD_CO_U32_e64:
2456+ VCCOp = &MI->getOperand (1 );
2457+ HasClamp = MI->getOperand (4 ).getImm ();
2458+ break ;
2459+ default :
2460+ break ;
2461+ }
2462+ bool DeadVCC = !VCCOp || VCCOp->isDead ();
2463+ MachineOperand &DstOp = MI->getOperand (0 );
2464+ Register DstReg = DstOp.getReg ();
2465+
2466+ unsigned OtherOpIdx =
2467+ FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2468+ MachineOperand *OtherOp = &MI->getOperand (OtherOpIdx);
2469+
2470+ unsigned Src1Idx = Src0Idx + 1 ;
2471+ Register MaterializedReg = FrameReg;
2472+ Register ScavengedVGPR;
2473+
2474+ if (FrameReg && !ST.enableFlatScratch ()) {
2475+ // We should just do an in-place update of the result register. However,
2476+ // the value there may also be used by the add, in which case we need a
2477+ // temporary register.
2478+ //
2479+ // FIXME: The scavenger is not finding the result register in the
2480+ // common case where the add does not read the register.
2481+
2482+ ScavengedVGPR = RS->scavengeRegisterBackwards (
2483+ AMDGPU::VGPR_32RegClass, MI, /* RestoreAfter=*/ false , /* SPAdj=*/ 0 );
2484+
2485+ // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2486+ // shift.
2487+ BuildMI (*MBB, *MI, DL, TII->get (AMDGPU::V_LSHRREV_B32_e64))
2488+ .addDef (ScavengedVGPR, RegState::Renamable)
2489+ .addImm (ST.getWavefrontSizeLog2 ())
2490+ .addReg (FrameReg);
2491+ MaterializedReg = ScavengedVGPR;
2492+ }
2493+
2494+ int64_t Offset = FrameInfo.getObjectOffset (Index);
2495+ // For the non-immediate case, we could fall through to the default
2496+ // handling, but we do an in-place update of the result register here to
2497+ // avoid scavenging another register.
2498+ if (OtherOp->isImm ()) {
2499+ OtherOp->setImm (OtherOp->getImm () + Offset);
2500+ Offset = 0 ;
2501+ }
2502+
2503+ if ((!OtherOp->isImm () || OtherOp->getImm () != 0 ) && MaterializedReg) {
2504+ if (ST.enableFlatScratch () &&
2505+ !TII->isOperandLegal (*MI, Src1Idx, OtherOp)) {
2506+ // We didn't need the shift above, so we have an SGPR for the frame
2507+ // register, but may have a VGPR only operand.
2508+ //
2509+ // TODO: On gfx10+, we can easily change the opcode to the e64 version
2510+ // and use the higher constant bus restriction to avoid this copy.
2511+
2512+ if (!ScavengedVGPR) {
2513+ ScavengedVGPR = RS->scavengeRegisterBackwards (
2514+ AMDGPU::VGPR_32RegClass, MI, /* RestoreAfter=*/ false ,
2515+ /* SPAdj=*/ 0 );
2516+ }
2517+
2518+ assert (ScavengedVGPR != DstReg);
2519+
2520+ BuildMI (*MBB, *MI, DL, TII->get (AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2521+ .addReg (MaterializedReg,
2522+ MaterializedReg != FrameReg ? RegState::Kill : 0 );
2523+ MaterializedReg = ScavengedVGPR;
2524+ }
2525+
2526+ // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2527+ // is not live, we could use a scalar add + vector add instead of 2
2528+ // vector adds.
2529+ auto AddI32 = BuildMI (*MBB, *MI, DL, TII->get (MI->getOpcode ()))
2530+ .addDef (DstReg, RegState::Renamable);
2531+ if (NumDefs == 2 )
2532+ AddI32.add (MI->getOperand (1 ));
2533+
2534+ unsigned MaterializedRegFlags =
2535+ MaterializedReg != FrameReg ? RegState::Kill : 0 ;
2536+
2537+ if (isVGPRClass (getPhysRegBaseClass (MaterializedReg))) {
2538+ // If we know we have a VGPR already, it's more likely the other
2539+ // operand is a legal vsrc0.
2540+ AddI32
2541+ .add (*OtherOp)
2542+ .addReg (MaterializedReg, MaterializedRegFlags);
2543+ } else {
2544+ // Commute operands to avoid violating VOP2 restrictions. This will
2545+ // typically happen when using scratch.
2546+ AddI32
2547+ .addReg (MaterializedReg, MaterializedRegFlags)
2548+ .add (*OtherOp);
2549+ }
2550+
2551+ if (MI->getOpcode () == AMDGPU::V_ADD_CO_U32_e64 ||
2552+ MI->getOpcode () == AMDGPU::V_ADD_U32_e64)
2553+ AddI32.addImm (0 ); // clamp
2554+
2555+ if (MI->getOpcode () == AMDGPU::V_ADD_CO_U32_e32)
2556+ AddI32.setOperandDead (3 ); // Dead vcc
2557+
2558+ MaterializedReg = DstReg;
2559+
2560+ OtherOp->ChangeToRegister (MaterializedReg, false );
2561+ OtherOp->setIsKill (true );
2562+ FIOp->ChangeToImmediate (Offset);
2563+ Offset = 0 ;
2564+ } else if (Offset != 0 ) {
2565+ assert (!MaterializedReg);
2566+ FIOp->ChangeToImmediate (Offset);
2567+ Offset = 0 ;
2568+ } else {
2569+ if (DeadVCC && !HasClamp) {
2570+ assert (Offset == 0 );
2571+
2572+ // TODO: Losing kills and implicit operands. Just mutate to copy and
2573+ // let lowerCopy deal with it?
2574+ if (OtherOp->isReg () && OtherOp->getReg () == DstReg) {
2575+ // Folded to an identity copy.
2576+ MI->eraseFromParent ();
2577+ return true ;
2578+ }
2579+
2580+ // The immediate value should be in OtherOp
2581+ MI->setDesc (TII->get (AMDGPU::V_MOV_B32_e32));
2582+ MI->removeOperand (FIOperandNum);
2583+
2584+ unsigned NumOps = MI->getNumOperands ();
2585+ for (unsigned I = NumOps - 2 ; I >= 2 ; --I)
2586+ MI->removeOperand (I);
2587+
2588+ if (NumDefs == 2 )
2589+ MI->removeOperand (1 );
2590+
2591+ // The code below can't deal with a mov.
2592+ return true ;
2593+ }
2594+
2595+ // This folded to a constant, but we have to keep the add around for
2596+ // pointless implicit defs or clamp modifier.
2597+ FIOp->ChangeToImmediate (0 );
2598+ }
2599+
2600+ // Try to improve legality by commuting.
2601+ if (!TII->isOperandLegal (*MI, Src1Idx) && TII->commuteInstruction (*MI)) {
2602+ std::swap (FIOp, OtherOp);
2603+ std::swap (FIOperandNum, OtherOpIdx);
2604+ }
2605+
2606+ for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
2607+ // Depending on operand constraints we may need to insert another copy.
2608+ if (!TII->isOperandLegal (*MI, SrcIdx)) {
2609+ // If commuting didn't make the operands legal, we need to materialize
2610+ // in a register.
2611+ // TODO: Can use SGPR on gfx10+ in some cases.
2612+ if (!ScavengedVGPR) {
2613+ ScavengedVGPR = RS->scavengeRegisterBackwards (
2614+ AMDGPU::VGPR_32RegClass, MI, /* RestoreAfter=*/ false ,
2615+ /* SPAdj=*/ 0 );
2616+ }
2617+
2618+ assert (ScavengedVGPR != DstReg);
2619+
2620+ MachineOperand &Src = MI->getOperand (SrcIdx);
2621+ BuildMI (*MBB, *MI, DL, TII->get (AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2622+ .add (Src);
2623+
2624+ Src.ChangeToRegister (ScavengedVGPR, false );
2625+ Src.setIsKill (true );
2626+ }
2627+ }
2628+
2629+ // Fold out add of 0 case that can appear in kernels.
2630+ if (FIOp->isImm () && FIOp->getImm () == 0 && DeadVCC && !HasClamp) {
2631+ if (OtherOp->isReg () && OtherOp->getReg () != DstReg) {
2632+ BuildMI (*MBB, *MI, DL, TII->get (AMDGPU::COPY), DstReg).add (*OtherOp);
2633+ }
2634+
2635+ MI->eraseFromParent ();
2636+ }
2637+
2638+ return true ;
2639+ }
24352640 case AMDGPU::S_ADD_I32: {
24362641 // TODO: Handle s_or_b32, s_and_b32.
24372642 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1 ;
@@ -2472,9 +2677,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24722677 Offset = 0 ;
24732678
24742679 if (MaterializedReg)
2475- FIOp. ChangeToRegister (MaterializedReg, false );
2680+ FIOp-> ChangeToRegister (MaterializedReg, false );
24762681 else
2477- FIOp. ChangeToImmediate (0 );
2682+ FIOp-> ChangeToImmediate (0 );
24782683 } else if (MaterializedReg) {
24792684 // If we can't fold the other operand, do another increment.
24802685 Register DstReg = DstOp.getReg ();
@@ -2497,27 +2702,27 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
24972702 OtherOp.ChangeToRegister (MaterializedReg, false );
24982703 OtherOp.setIsKill (true );
24992704 OtherOp.setIsRenamable (true );
2500- FIOp. ChangeToImmediate (Offset);
2705+ FIOp-> ChangeToImmediate (Offset);
25012706 } else {
25022707 // If we don't have any other offset to apply, we can just directly
25032708 // interpret the frame index as the offset.
2504- FIOp. ChangeToImmediate (Offset);
2709+ FIOp-> ChangeToImmediate (Offset);
25052710 }
25062711
25072712 if (DeadSCC && OtherOp.isImm () && OtherOp.getImm () == 0 ) {
25082713 assert (Offset == 0 );
25092714 MI->removeOperand (3 );
25102715 MI->removeOperand (OtherOpIdx);
2511- MI->setDesc (TII->get (FIOp. isReg () ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2512- } else if (DeadSCC && FIOp. isImm () && FIOp. getImm () == 0 ) {
2716+ MI->setDesc (TII->get (FIOp-> isReg () ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2717+ } else if (DeadSCC && FIOp-> isImm () && FIOp-> getImm () == 0 ) {
25132718 assert (Offset == 0 );
25142719 MI->removeOperand (3 );
25152720 MI->removeOperand (FIOperandNum);
25162721 MI->setDesc (
25172722 TII->get (OtherOp.isReg () ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
25182723 }
25192724
2520- assert (!FIOp. isFI ());
2725+ assert (!FIOp-> isFI ());
25212726 return true ;
25222727 }
25232728 default : {
@@ -2533,7 +2738,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
25332738
25342739 // The offset is always swizzled, just replace it
25352740 if (FrameReg)
2536- FIOp. ChangeToRegister (FrameReg, false );
2741+ FIOp-> ChangeToRegister (FrameReg, false );
25372742
25382743 MachineOperand *OffsetOp =
25392744 TII->getNamedOperand (*MI, AMDGPU::OpName::offset);
@@ -2586,18 +2791,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
25862791 }
25872792
25882793 if (!FrameReg) {
2589- FIOp. ChangeToImmediate (Offset);
2590- if (TII->isImmOperandLegal (*MI, FIOperandNum, FIOp))
2794+ FIOp-> ChangeToImmediate (Offset);
2795+ if (TII->isImmOperandLegal (*MI, FIOperandNum, * FIOp))
25912796 return false ;
25922797 }
25932798
25942799 // We need to use register here. Check if we can use an SGPR or need
25952800 // a VGPR.
2596- FIOp. ChangeToRegister (AMDGPU::M0, false );
2597- bool UseSGPR = TII->isOperandLegal (*MI, FIOperandNum, & FIOp);
2801+ FIOp-> ChangeToRegister (AMDGPU::M0, false );
2802+ bool UseSGPR = TII->isOperandLegal (*MI, FIOperandNum, FIOp);
25982803
25992804 if (!Offset && FrameReg && UseSGPR) {
2600- FIOp. setReg (FrameReg);
2805+ FIOp-> setReg (FrameReg);
26012806 return false ;
26022807 }
26032808
@@ -2606,8 +2811,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26062811
26072812 Register TmpReg =
26082813 RS->scavengeRegisterBackwards (*RC, MI, false , 0 , !UseSGPR);
2609- FIOp. setReg (TmpReg);
2610- FIOp. setIsKill ();
2814+ FIOp-> setReg (TmpReg);
2815+ FIOp-> setIsKill ();
26112816
26122817 if ((!FrameReg || !Offset) && TmpReg) {
26132818 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2636,8 +2841,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
26362841 if (!TmpSReg) {
26372842 // Use frame register and restore it after.
26382843 TmpSReg = FrameReg;
2639- FIOp. setReg (FrameReg);
2640- FIOp. setIsKill (false );
2844+ FIOp-> setReg (FrameReg);
2845+ FIOp-> setIsKill (false );
26412846 }
26422847
26432848 if (NeedSaveSCC) {
@@ -2885,7 +3090,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
28853090 MI->eraseFromParent ();
28863091 return true ;
28873092 }
2888- FIOp. ChangeToRegister (ResultReg, false , false , true );
3093+ FIOp-> ChangeToRegister (ResultReg, false , false , true );
28893094 return false ;
28903095 }
28913096
@@ -2916,13 +3121,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
29163121 // If the offset is simply too big, don't convert to a scratch wave offset
29173122 // relative index.
29183123
2919- FIOp. ChangeToImmediate (Offset);
2920- if (!TII->isImmOperandLegal (*MI, FIOperandNum, FIOp)) {
3124+ FIOp-> ChangeToImmediate (Offset);
3125+ if (!TII->isImmOperandLegal (*MI, FIOperandNum, * FIOp)) {
29213126 Register TmpReg = RS->scavengeRegisterBackwards (AMDGPU::VGPR_32RegClass,
29223127 MI, false , 0 );
29233128 BuildMI (*MBB, MI, DL, TII->get (AMDGPU::V_MOV_B32_e32), TmpReg)
29243129 .addImm (Offset);
2925- FIOp. ChangeToRegister (TmpReg, false , false , true );
3130+ FIOp-> ChangeToRegister (TmpReg, false , false , true );
29263131 }
29273132 }
29283133 }
0 commit comments