Merged (changes from 5 commits)
131 changes: 92 additions & 39 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4097,7 +4097,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (VT.getScalarType() != MVT::i64)
return SDValue();

// i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
@@ -4117,12 +4117,12 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
TargetType);
} else {
SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
const SDValue ShiftMask =
DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
// This AND instruction will clamp out of bounds shift values.
// It will also be removed during later instruction selection.
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
}

SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
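
The identity this combine relies on can be checked in isolation. A minimal standalone sketch, not LLVM code; the BuildPair helper is hypothetical and mirrors ISD::BUILD_PAIR's (lo, hi) operand order:

```cpp
#include <cassert>
#include <cstdint>

// Models i64 ISD::BUILD_PAIR: the first operand is the low 32 bits.
static uint64_t BuildPair(uint32_t Lo, uint32_t Hi) {
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  const uint64_t X = 0x123456789ABCDEF0ULL;
  // For 32 <= C < 64: i64 (shl x, C) == build_pair(0, lo32(x) << (C - 32)).
  for (unsigned C = 32; C < 64; ++C)
    assert((X << C) == BuildPair(0, uint32_t(X) << (C - 32)));
  return 0;
}
```

The variable-amount path depends on the same range: for 32 <= C < 64, C - 32 equals C & 31, so the TRUNCATE plus AND clamp yields the correct 32-bit shift amount.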
@@ -4181,50 +4181,105 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,

SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!RHS)
return SDValue();

SDValue RHS = N->getOperand(1);
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
unsigned ShiftAmt = RHS->getZExtValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
unsigned RHSVal;

// fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
// this improves the ability to match BFE patterns in isel.
if (LHS.getOpcode() == ISD::AND) {
if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
unsigned MaskIdx, MaskLen;
if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
MaskIdx == ShiftAmt) {
return DAG.getNode(
ISD::AND, SL, VT,
DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
if (CRHS) {
RHSVal = CRHS->getZExtValue();

// fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
Contributor: Can we do this in DAG combine? This seems to be target dependent.

// this improves the ability to match BFE patterns in isel.
if (LHS.getOpcode() == ISD::AND) {
if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
unsigned MaskIdx, MaskLen;
if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
MaskIdx == RHSVal) {
return DAG.getNode(ISD::AND, SL, VT,
DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
N->getOperand(1)),
DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
N->getOperand(1)));
}
}
}
}

if (VT != MVT::i64)
if (VT.getScalarType() != MVT::i64)
return SDValue();

if (ShiftAmt < 32)
// for C >= 32
// i64 (srl x, C) -> (build_pair (srl hi_32(x), C -32), 0)

// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
// the same code size.
KnownBits Known = DAG.computeKnownBits(RHS);

EVT ElementType = VT.getScalarType();
EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
: TargetScalarType;

if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
return SDValue();

// srl i64:x, C for C >= 32
// =>
// build_pair (srl hi_32(x), C - 32), 0
SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
SDValue ShiftAmt;
if (CRHS) {
ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
TargetType);
} else {
SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
const SDValue ShiftMask =
DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
// This AND instruction will clamp out of bounds shift values.
// It will also be removed during later instruction selection.
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
}

const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
EVT ConcatType;
SDValue Hi;
SDLoc LHSSL(LHS);
// Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
if (VT.isVector()) {
unsigned NElts = TargetType.getVectorNumElements();
ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
SmallVector<SDValue, 8> HiOps(NElts);
SmallVector<SDValue, 16> HiAndLoOps;

SDValue Hi = getHiHalf64(LHS, DAG);
DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
for (unsigned I = 0; I != NElts; ++I)
HiOps[I] = HiAndLoOps[2 * I + 1];
Contributor: I think you can simply use insert.

Contributor (author): I feel array indexing is clearer than using an insert. For reference, look at the last commit of #132964, which switched from insert to array indexing to address feedback.
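
On the indexing question above: after the v2i64 to v4i32 bitcast, the high half of 64-bit lane I lands at 32-bit index 2 * I + 1. A host-side sketch of that layout, with memcpy standing in for the DAG bitcast and assuming a little-endian lane order like the one this code relies on; not LLVM code:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint64_t Src[2] = {0x1111111122222222ULL, 0x3333333344444444ULL};
  uint32_t Halves[4];
  std::memcpy(Halves, Src, sizeof(Src)); // v2i64 -> v4i32 reinterpretation
  for (unsigned I = 0; I != 2; ++I) {
    assert(Halves[2 * I] == uint32_t(Src[I]));           // low half of lane I
    assert(Halves[2 * I + 1] == uint32_t(Src[I] >> 32)); // high half of lane I
  }
  return 0;
}
```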

Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
} else {
const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
}

SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
SDValue NewShift = DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt);

SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
SDValue Vec;
if (VT.isVector()) {
unsigned NElts = TargetType.getVectorNumElements();
SmallVector<SDValue, 8> LoOps;
SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
for (unsigned I = 0; I != NElts; ++I)
HiAndLoOps[2 * I] = LoOps[I];
Contributor: similarly, insert

Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
} else {
Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
}
return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}

SDValue AMDGPUTargetLowering::performTruncateCombine(
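
The srl rewrite follows the same shape as the shl case: once computeKnownBits proves every possible shift amount is at least 32, only the high half of the source contributes to the result. A standalone sketch, not LLVM code; BuildPair is a hypothetical helper mirroring ISD::BUILD_PAIR's (lo, hi) operand order:

```cpp
#include <cassert>
#include <cstdint>

// Models i64 ISD::BUILD_PAIR: the first operand is the low 32 bits.
static uint64_t BuildPair(uint32_t Lo, uint32_t Hi) {
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  const uint64_t X = 0xFEDCBA9876543210ULL;
  for (unsigned C = 32; C < 64; ++C) {
    // build_pair (srl hi_32(x), C - 32), 0. In this range C - 32 == (C & 31),
    // which is why the ISD::AND clamp can replace an explicit subtraction
    // when the shift amount is not a constant.
    assert((X >> C) == BuildPair(uint32_t(X >> 32) >> (C & 31), 0));
  }
  return 0;
}
```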
@@ -5209,21 +5264,19 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,

break;
}
case ISD::SHL: {
case ISD::SHL:
case ISD::SRL: {
// Range metadata can be invalidated when loads are converted to legal types
// (e.g. v2i64 -> v4i32).
// Try to convert vector shl before type legalization so that range metadata
// can be utilized.
// Try to convert vector shl/srl before type legalization so that range
// metadata can be utilized.
if (!(N->getValueType(0).isVector() &&
DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;
return performShlCombine(N, DCI);
}
case ISD::SRL: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;

if (N->getOpcode() == ISD::SHL) {
return performShlCombine(N, DCI);
}
return performSrlCombine(N, DCI);
}
case ISD::SRA: {
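The srl-of-and fold earlier in this file rests on a small identity: when the mask is a shifted run of contiguous ones whose start index equals the shift amount, masking before the shift is equivalent to masking after it. A standalone check with illustrative constants, not LLVM code:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0xFF; // contiguous ones
  const uint32_t C2 = 8;    // shift amount == start index of (C1 << C2)
  for (uint32_t X : {0u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    // (srl (and x, c1 << c2), c2) == (and (srl x, c2), c1)
    assert(((X & (C1 << C2)) >> C2) == ((X >> C2) & C1));
  }
  return 0;
}
```

The rewritten form exposes (srl x, c2) followed by a constant AND, which is the shape the BFE (bitfield extract) selection patterns match.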
77 changes: 30 additions & 47 deletions llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1945,16 +1945,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
; CI-LABEL: lshr_mad_i64_vec:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v6, v3
; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 s4, 0xffff1c18
; CI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
; CI-NEXT: s_mov_b32 s4, 0xffff1118
; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
; CI-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
; CI-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
; CI-NEXT: v_mov_b32_e32 v0, v4
; CI-NEXT: v_mov_b32_e32 v1, v5
; CI-NEXT: v_mov_b32_e32 v2, v6
; CI-NEXT: s_setpc_b64 s[30:31]
;
; SI-LABEL: lshr_mad_i64_vec:
@@ -1977,44 +1975,28 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
; GFX9-LABEL: lshr_mad_i64_vec:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s4, 0xffff1c18
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
; GFX9-NEXT: s_mov_b32 s4, 0xffff1118
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1
; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: v_mov_b32_e32 v2, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: lshr_mad_i64_vec:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_mov_b32_e32 v8, v3
; GFX1100-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v1, 0
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_mad_u64_u32 v[4:5], null, 0xffff1c18, v6, v[0:1]
; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, v4
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_mad_u64_u32 v[6:7], null, 0xffff1118, v8, v[2:3]
; GFX1100-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-NEXT: v_mov_b32_e32 v3, v7
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX1150-LABEL: lshr_mad_i64_vec:
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1150-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
; GFX1150-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1150-NEXT: v_mov_b32_e32 v3, v1
; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1150-NEXT: v_mad_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
; GFX1150-NEXT: s_setpc_b64 s[30:31]
; GFX11-LABEL: lshr_mad_i64_vec:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, v5, v1
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, v7, v3
; GFX11-NEXT: v_mov_b32_e32 v2, v6
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: lshr_mad_i64_vec:
; GFX12: ; %bb.0:
@@ -2023,13 +2005,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mov_b32_e32 v3, v1
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
; GFX12-NEXT: v_mov_b32_e32 v2, v6
; GFX12-NEXT: s_setpc_b64 s[30:31]
%lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
%mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>