@@ -1969,7 +1969,7 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
19691969 IsDivergentIdx, &Subtarget))
19701970 return false ;
19711971
1972- LLT S32 = LLT::scalar (32 );
1972+ LLT I32 = LLT::integer (32 );
19731973
19741974 const RegisterBank &DstBank =
19751975 *OpdMapper.getInstrMapping ().getOperandMapping (0 ).BreakDown [0 ].RegBank ;
@@ -1981,10 +1981,10 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
19811981 SrcBank == AMDGPU::SGPRRegBank &&
19821982 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
19831983 : AMDGPU::VCCRegBank;
1984- LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar (1 );
1984+ LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? I32 : LLT::integer (1 );
19851985
19861986 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1987- Idx = B.buildCopy (S32 , Idx)->getOperand (0 ).getReg ();
1987+ Idx = B.buildCopy (I32 , Idx)->getOperand (0 ).getReg ();
19881988 MRI.setRegBank (Idx, AMDGPU::VGPRRegBank);
19891989 }
19901990
@@ -1996,13 +1996,19 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
19961996 else
19971997 EltTy = MRI.getType (DstRegs[0 ]);
19981998
1999+ if (VecTy.isFloatVector ()) {
2000+ auto ClassOrBank = MRI.getRegClassOrRegBank (VecReg);
2001+ VecReg = B.buildBitcast ({ClassOrBank, VecTy.changeToInteger ()}, VecReg).getReg (0 );
2002+ }
2003+
19992004 auto UnmergeToEltTy = B.buildUnmerge (EltTy, VecReg);
20002005 SmallVector<Register, 2 > Res (NumLanes);
2001- for (unsigned L = 0 ; L < NumLanes; ++L)
2006+ for (unsigned L = 0 ; L < NumLanes; ++L) {
20022007 Res[L] = UnmergeToEltTy.getReg (L);
2008+ }
20032009
20042010 for (unsigned I = 1 ; I < NumElem; ++I) {
2005- auto IC = B.buildConstant (S32 , I);
2011+ auto IC = B.buildConstant (I32 , I);
20062012 MRI.setRegBank (IC->getOperand (0 ).getReg (), AMDGPU::SGPRRegBank);
20072013 auto Cmp = B.buildICmp (CmpInst::ICMP_EQ, CCTy, Idx, IC);
20082014 MRI.setRegBank (Cmp->getOperand (0 ).getReg (), CCBank);
@@ -2067,7 +2073,7 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
20672073 IsDivergentIdx, &Subtarget))
20682074 return false ;
20692075
2070- LLT S32 = LLT::scalar (32 );
2076+ LLT I32 = LLT::integer (32 );
20712077
20722078 const RegisterBank &DstBank =
20732079 *OpdMapper.getInstrMapping ().getOperandMapping (0 ).BreakDown [0 ].RegBank ;
@@ -2082,10 +2088,10 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
20822088 InsBank == AMDGPU::SGPRRegBank &&
20832089 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
20842090 : AMDGPU::VCCRegBank;
2085- LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar (1 );
2091+ LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? I32 : LLT::integer (1 );
20862092
20872093 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2088- Idx = B.buildCopy (S32 , Idx)->getOperand (0 ).getReg ();
2094+ Idx = B.buildCopy (I32 , Idx)->getOperand (0 ).getReg ();
20892095 MRI.setRegBank (Idx, AMDGPU::VGPRRegBank);
20902096 }
20912097
@@ -2099,11 +2105,17 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
20992105 EltTy = MRI.getType (InsRegs[0 ]);
21002106 }
21012107
2108+ if (VecTy.getScalarType ().isFloat () && !EltTy.isFloat ()) {
2109+ auto RegBankOrClass = MRI.getRegClassOrRegBank (VecReg);
2110+ auto CastTy = VecTy.changeToInteger ();
2111+ VecReg = B.buildBitcast ({RegBankOrClass, CastTy}, VecReg).getReg (0 );
2112+ }
2113+
21022114 auto UnmergeToEltTy = B.buildUnmerge (EltTy, VecReg);
21032115 SmallVector<Register, 16 > Ops (NumElem * NumLanes);
21042116
21052117 for (unsigned I = 0 ; I < NumElem; ++I) {
2106- auto IC = B.buildConstant (S32 , I);
2118+ auto IC = B.buildConstant (I32 , I);
21072119 MRI.setRegBank (IC->getOperand (0 ).getReg (), AMDGPU::SGPRRegBank);
21082120 auto Cmp = B.buildICmp (CmpInst::ICMP_EQ, CCTy, Idx, IC);
21092121 MRI.setRegBank (Cmp->getOperand (0 ).getReg (), CCBank);
@@ -2156,7 +2168,7 @@ void AMDGPURegisterBankInfo::applyMappingSMULU64(
21562168 MachineRegisterInfo &MRI = OpdMapper.getMRI ();
21572169 MachineInstr &MI = OpdMapper.getMI ();
21582170 Register DstReg = MI.getOperand (0 ).getReg ();
2159- LLT HalfTy = LLT::scalar (32 );
2171+ LLT HalfTy = LLT::integer (32 );
21602172
21612173 // Depending on where the source registers came from, the generic code may
21622174 // have decided to split the inputs already or not. If not, we still need to
@@ -2828,7 +2840,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
28282840 Register DstReg = MI.getOperand (0 ).getReg ();
28292841 Register SrcReg = MI.getOperand (1 ).getReg ();
28302842
2831- const LLT S32 = LLT::scalar (32 );
2843+ const LLT I32 = LLT::integer (32 );
28322844 LLT DstTy = MRI.getType (DstReg);
28332845 LLT SrcTy = MRI.getType (SrcReg);
28342846
@@ -2891,10 +2903,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
28912903
28922904 assert (DstTy.getSizeInBits () == 64 );
28932905
2894- LLT Vec32 = LLT::fixed_vector (2 * SrcTy.getNumElements (), 32 );
2906+ LLT Vec32 = LLT::fixed_vector (2 * SrcTy.getNumElements (), I32 );
28952907
28962908 auto CastSrc = B.buildBitcast (Vec32, SrcReg);
2897- auto One = B.buildConstant (S32 , 1 );
2909+ auto One = B.buildConstant (I32 , 1 );
28982910
28992911 MachineBasicBlock::iterator MII = MI.getIterator ();
29002912
@@ -2905,8 +2917,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
29052917 MachineInstrSpan Span (MII, &B.getMBB ());
29062918
29072919 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2908- auto IdxLo = B.buildShl (S32 , BaseIdxReg, One);
2909- auto IdxHi = B.buildAdd (S32 , IdxLo, One);
2920+ auto IdxLo = B.buildShl (I32 , BaseIdxReg, One);
2921+ auto IdxHi = B.buildAdd (I32 , IdxLo, One);
29102922
29112923 auto Extract0 = B.buildExtractVectorElement (DstRegs[0 ], CastSrc, IdxLo);
29122924 auto Extract1 = B.buildExtractVectorElement (DstRegs[1 ], CastSrc, IdxHi);
@@ -2932,8 +2944,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
29322944
29332945 if (NeedCopyToVGPR) {
29342946 MachineBasicBlock *LoopBB = Extract1->getParent ();
2935- Register TmpReg0 = MRI.createGenericVirtualRegister (S32 );
2936- Register TmpReg1 = MRI.createGenericVirtualRegister (S32 );
2947+ Register TmpReg0 = MRI.createGenericVirtualRegister (I32 );
2948+ Register TmpReg1 = MRI.createGenericVirtualRegister (I32 );
29372949 MRI.setRegBank (TmpReg0, AMDGPU::SGPRRegBank);
29382950 MRI.setRegBank (TmpReg1, AMDGPU::SGPRRegBank);
29392951
0 commit comments