@@ -2086,7 +2086,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2268,6 +2268,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
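+    // The explicit defs come first, so src0 sits at getNumExplicitDefs():
+    // one def for the V_ADD_U32 forms and V_ADD_CO_U32_e32 (vcc is implicit
+    // there), two (result and carry-out) for V_ADD_CO_U32_e64.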
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
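+    // Only when the carry-out is dead (or absent) and clamp is unset may the
+    // add later degenerate to a plain mov, copy, or nothing at all; the
+    // folds below check DeadVCC and HasClamp for exactly that reason.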
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
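+    // OtherOp is whichever source is not the frame index: src1 if the frame
+    // index is src0, otherwise src0.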
+    unsigned OtherOpIdx = FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
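+      // Without flat scratch, the SGPR frame register holds a swizzled
+      // (wave-size-scaled) byte offset; shift it right by log2(wavesize) to
+      // recover the unswizzled offset a VALU add can consume.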
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
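+    // Fold the materialized base into the add in place: emit a new add that
+    // computes DstReg = OtherOp + MaterializedReg, then rewrite this
+    // instruction as DstReg = DstReg + remaining offset.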
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
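+    // For the VOP2 forms src1 must be a VGPR while src0 may take an
+    // immediate or SGPR, so swapping the operands can make the folded
+    // frame-index immediate legal.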
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
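+    // Kernels typically have no frame register and objects commonly start at
+    // offset 0, so after the folds above the add is often just reg + 0.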
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_AND_B32: {
@@ -2336,7 +2538,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     } else {
       if (MaterializedReg)
         OtherOp.ChangeToRegister(MaterializedReg, false);
-      FIOp.ChangeToImmediate(NewOffset);
+      FIOp->ChangeToImmediate(NewOffset);
     }
 
     return true;
@@ -2354,7 +2556,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       // The offset is always swizzled, just replace it
       if (FrameReg)
-        FIOp.ChangeToRegister(FrameReg, false);
+        FIOp->ChangeToRegister(FrameReg, false);
 
       MachineOperand *OffsetOp =
           TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2407,18 +2609,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       }
 
       if (!FrameReg) {
-        FIOp.ChangeToImmediate(Offset);
-        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+        FIOp->ChangeToImmediate(Offset);
+        if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
           return false;
       }
 
       // We need to use register here. Check if we can use an SGPR or need
       // a VGPR.
-      FIOp.ChangeToRegister(AMDGPU::M0, false);
-      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+      FIOp->ChangeToRegister(AMDGPU::M0, false);
+      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
       if (!Offset && FrameReg && UseSGPR) {
-        FIOp.setReg(FrameReg);
+        FIOp->setReg(FrameReg);
         return false;
       }
 
@@ -2427,8 +2629,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       Register TmpReg =
           RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-      FIOp.setReg(TmpReg);
-      FIOp.setIsKill();
+      FIOp->setReg(TmpReg);
+      FIOp->setIsKill();
 
       if ((!FrameReg || !Offset) && TmpReg) {
         unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2457,8 +2659,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (!TmpSReg) {
         // Use frame register and restore it after.
         TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
       }
 
       if (NeedSaveSCC) {
@@ -2706,7 +2908,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         MI->eraseFromParent();
         return true;
       }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
       return false;
     }
27122914
@@ -2737,13 +2939,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       // If the offset is simply too big, don't convert to a scratch wave offset
       // relative index.
 
-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
         Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                         MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
             .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }