Commit 9ce8d21

Convert more 64-bit lshr to 32-bit if shift amt>=32

Signed-off-by: John Lu <[email protected]>

1 parent 132f786 commit 9ce8d21
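
The combine touched here rewrites a 64-bit logical shift right as a 32-bit shift of the source's high half whenever the shift amount is provably at least 32. Before this commit that proof had to come from a constant operand; now known-bits analysis is consulted, so a non-constant amount can qualify too. A minimal hand-written IR sketch of an input the extended combine can now handle (the function name and the `or` used to pin the known bits are illustrative, not taken from this commit or its tests):

define i64 @srl_amt_known_ge_32(i64 %x, i64 %amt) {
  ; Setting bit 5 lets computeKnownBits prove the shift amount is >= 32.
  %bounded = or i64 %amt, 32
  %shifted = lshr i64 %x, %bounded
  ret i64 %shifted
}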

File tree

3 files changed: +687 −84 lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 96 additions & 37 deletions
@@ -4176,50 +4176,110 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
 
 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
-  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS)
-    return SDValue();
-
+  SDValue RHS = N->getOperand(1);
+  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
   EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
-  unsigned ShiftAmt = RHS->getZExtValue();
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
+  unsigned RHSVal;
+
+  if (CRHS) {
+    RHSVal = CRHS->getZExtValue();
 
-  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
-  // this improves the ability to match BFE patterns in isel.
-  if (LHS.getOpcode() == ISD::AND) {
-    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
-      unsigned MaskIdx, MaskLen;
-      if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
-          MaskIdx == ShiftAmt) {
-        return DAG.getNode(
-            ISD::AND, SL, VT,
-            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
-            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
+    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
+    // this improves the ability to match BFE patterns in isel.
+    if (LHS.getOpcode() == ISD::AND) {
+      if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
+        unsigned MaskIdx, MaskLen;
+        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
+            MaskIdx == RHSVal) {
+          return DAG.getNode(ISD::AND, SL, VT,
+                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
+                                         N->getOperand(1)),
+                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
+                                         N->getOperand(1)));
+        }
       }
     }
   }
 
-  if (VT != MVT::i64)
+  // If the shift is exact, the shifted out bits matter.
+  if (N->getFlags().hasExact())
     return SDValue();
 
-  if (ShiftAmt < 32)
+  if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // srl i64:x, C for C >= 32
-  // =>
-  //   build_pair (srl hi_32(x), C - 32), 0
-  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  // for C >= 32
+  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C -32), 0)
 
-  SDValue Hi = getHiHalf64(LHS, DAG);
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
+  KnownBits Known = DAG.computeKnownBits(RHS);
 
-  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
-  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+  EVT ElementType = VT.getScalarType();
+  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
+  EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
+                                 : TargetScalarType;
 
-  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
+  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
+    return SDValue();
 
-  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
+  SDValue ShiftAmt;
+  if (CRHS) {
+    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
+                               TargetType);
+  } else {
+    SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
+    const SDValue ShiftMask =
+        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
+    // This AND instruction will clamp out of bounds shift values.
+    // It will also be removed during later instruction selection.
+    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
+  }
+
+  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
+  EVT ConcatType;
+  SDValue Hi;
+  SDLoc LHSSL(LHS);
+  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
+  if (VT.isVector()) {
+    unsigned NElts = TargetType.getVectorNumElements();
+    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
+    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
+    SmallVector<SDValue, 8> HiOps(NElts);
+    SmallVector<SDValue, 16> HiAndLoOps;
+
+    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
+    for (unsigned I = 0; I != NElts; ++I) {
+      HiOps[I] = HiAndLoOps[2 * I + 1];
+    }
+    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
+  } else {
+    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
+    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
+    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
+    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
+  }
+
+  SDValue NewShift = DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt);
+
+  SDValue Vec;
+  if (VT.isVector()) {
+    unsigned NElts = TargetType.getVectorNumElements();
+    SmallVector<SDValue, 8> LoOps;
+    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
+
+    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
+    for (unsigned I = 0; I != NElts; ++I)
+      HiAndLoOps[2 * I] = LoOps[I];
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
+  } else {
+    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
+  }
+  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
 }
 
 SDValue AMDGPUTargetLowering::performTruncateCombine(
@@ -5198,22 +5258,21 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
-  case ISD::SHL: {
+  case ISD::SHL:
+  case ISD::SRL: {
     // Range metadata can be invalidated when loads are converted to legal types
     // (e.g. v2i64 -> v4i32).
-    // Try to convert vector shl before type legalization so that range metadata
-    // can be utilized.
+    // Try to convert vector shl/srl before type legalization so that range
+    // metadata can be utilized.
     if (!(N->getValueType(0).isVector() &&
           DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
        DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;
-    return performShlCombine(N, DCI);
-  }
-  case ISD::SRL: {
-    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
-      break;
-
-    return performSrlCombine(N, DCI);
+    if (N->getOpcode() == ISD::SHL) {
+      return performShlCombine(N, DCI);
+    } else {
+      return performSrlCombine(N, DCI);
+    }
  }
  case ISD::SRA: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
llvm/test/CodeGen/AMDGPU/mad_64_32.ll

Lines changed: 30 additions & 47 deletions
@@ -1947,16 +1947,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; CI-LABEL: lshr_mad_i64_vec:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v6, v3
-; CI-NEXT:    v_mov_b32_e32 v3, v1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s4, 0xffff1c18
-; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, v1
+; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
 ; CI-NEXT:    s_mov_b32 s4, 0xffff1118
-; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
+; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
 ; CI-NEXT:    v_mov_b32_e32 v0, v4
-; CI-NEXT:    v_mov_b32_e32 v1, v5
+; CI-NEXT:    v_mov_b32_e32 v2, v6
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: lshr_mad_i64_vec:
@@ -1979,44 +1977,28 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; GFX9-LABEL: lshr_mad_i64_vec:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v6, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff1c18
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff1118
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: lshr_mad_i64_vec:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v8, v3
-; GFX1100-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v6, v[0:1]
-; GFX1100-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v8, v[2:3]
-; GFX1100-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, v7
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1150-LABEL: lshr_mad_i64_vec:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1150-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
-; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1150-NEXT:    v_mad_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
-; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: lshr_mad_i64_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX11-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: lshr_mad_i64_vec:
 ; GFX12:       ; %bb.0:
@@ -2025,13 +2007,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v3, v1
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX12-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
   %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
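
The test above exercises the new vector path: each <2 x i64> lane is shifted by exactly 32, so the result is each lane's high word moved down with zeros above it. A hand-written IR sketch of the same rewrite over the bitcast form the combine uses internally (illustrative only, not compiler output; on little-endian AMDGPU the high word of lane I is element 2*I+1 of the <4 x i32> view):

define <2 x i64> @lshr_v2i64_by_32_sketch(<2 x i64> %x) {
  %words = bitcast <2 x i64> %x to <4 x i32>
  ; gather the high words of both lanes
  %his = shufflevector <4 x i32> %words, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %shifted = lshr <2 x i32> %his, <i32 0, i32 0>   ; new amount: 32 - 32 = 0
  ; scatter back into the even (low-word) slots, zeros in the odd slots
  %lo0 = extractelement <2 x i32> %shifted, i32 0
  %lo1 = extractelement <2 x i32> %shifted, i32 1
  %tmp = insertelement <4 x i32> zeroinitializer, i32 %lo0, i32 0
  %res4 = insertelement <4 x i32> %tmp, i32 %lo1, i32 2
  %res = bitcast <4 x i32> %res4 to <2 x i64>
  ret <2 x i64> %res
}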
