Skip to content

Commit 216f7c0

Browse files
committed
Fix ashr scalarisation and update ptradd-sdag-optimizations.ll with latest tests
1 parent 2192997 commit 216f7c0

File tree

3 files changed

+205
-140
lines changed

3 files changed

+205
-140
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 101 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4220,6 +4220,49 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
42204220
SelectionDAG &DAG = DCI.DAG;
42214221
SDLoc SL(N);
42224222

4223+
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4224+
SDValue VAND = RHS.getOperand(0);
4225+
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
4226+
uint64_t AndIndex = RHS->getConstantOperandVal(1);
4227+
if (VAND->getOpcode() == ISD::AND && CRRHS) {
4228+
SDValue LHSAND = VAND.getOperand(0);
4229+
SDValue RHSAND = VAND.getOperand(1);
4230+
if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
4231+
// Part of sracombine is to optimise for the case where it's possible
4232+
// to reduce sra64 to sra32 if shift range is [63-32]. This transforms:
4233+
// DST = sra i64 X, Y to [sra i32 Xhi, (Y & 31), sra i32 Xhi, 31]. The
4234+
// '&' is then elided by ISel. The vector code for this was being
4235+
// completely scalarised by the vector legalizer, but now v2i32 is
4236+
// made legal the vector legaliser only partially scalarises the
4237+
// vector operations and the 'and' was not elided. This check enables us
4238+
// to locate and scalarise the v2i32 'and', and re-enable ISel to elide
4239+
// the and instruction.
4240+
ConstantSDNode *CANDL =
4241+
dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
4242+
ConstantSDNode *CANDR =
4243+
dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
4244+
if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
4245+
RHSAND->getConstantOperandVal(1) == 0x1f) {
4246+
// Get the non-const AND operands and produce scalar AND
4247+
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4248+
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
4249+
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
4250+
LHSAND, Zero);
4251+
SDValue Hi =
4252+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
4253+
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
4254+
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
4255+
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
4256+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4257+
if (AndIndex == 0 || AndIndex == 1)
4258+
return DAG.getNode(ISD::SRA, SL, MVT::i32, Trunc,
4259+
AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
4260+
}
4261+
}
4262+
}
4263+
}
4264+
}
4265+
42234266
if (VT.getScalarType() != MVT::i64)
42244267
return SDValue();
42254268

@@ -4312,8 +4355,63 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
43124355
return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
43134356
}
43144357

4315-
static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) {
4316-
SDLoc SL = SDLoc(RHS);
4358+
// static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) {
4359+
// SDLoc SL = SDLoc(RHS);
4360+
// if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4361+
// SDValue VAND = RHS.getOperand(0);
4362+
// if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
4363+
// uint64_t AndIndex = RHS->getConstantOperandVal(1);
4364+
// if (VAND->getOpcode() == ISD::AND && CRRHS) {
4365+
// SDValue LHSAND = VAND.getOperand(0);
4366+
// SDValue RHSAND = VAND.getOperand(1);
4367+
// if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
4368+
// // Part of srlcombine is to optimise for the case where it's possible
4369+
// // to reduce shl64 to shl32 if shift range is [63-32]. This
4370+
// // transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The
4371+
// // '&' is then elided by ISel. The vector code for this was being
4372+
// // completely scalarised by the vector legalizer, but now v2i32 is
4373+
// // made legal the vector legaliser only partially scalarises the
4374+
// // vector operations and the 'and' was not elided. This check enables us
4375+
// // to locate and scalarise the v2i32 'and', and re-enable ISel to elide
4376+
// // the and instruction.
4377+
// ConstantSDNode *CANDL =
4378+
// dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
4379+
// ConstantSDNode *CANDR =
4380+
// dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
4381+
// if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
4382+
// RHSAND->getConstantOperandVal(1) == 0x1f) {
4383+
// // Get the non-const AND operands and produce scalar AND
4384+
// const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4385+
// const SDValue One = DAG.getConstant(1, SL, MVT::i32);
4386+
// SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
4387+
// LHSAND, Zero);
4388+
// SDValue Hi =
4389+
// DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
4390+
// SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
4391+
// SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
4392+
// SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
4393+
// SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4394+
// if (AndIndex == 0 || AndIndex == 1)
4395+
// return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
4396+
// AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags());
4397+
// }
4398+
// }
4399+
// }
4400+
// }
4401+
// }
4402+
// return SDValue();
4403+
// }
4404+
4405+
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4406+
DAGCombinerInfo &DCI) const {
4407+
SDValue RHS = N->getOperand(1);
4408+
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4409+
EVT VT = N->getValueType(0);
4410+
SDValue LHS = N->getOperand(0);
4411+
SelectionDAG &DAG = DCI.DAG;
4412+
SDLoc SL(N);
4413+
unsigned RHSVal;
4414+
43174415
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
43184416
SDValue VAND = RHS.getOperand(0);
43194417
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
@@ -4350,25 +4448,12 @@ static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) {
43504448
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
43514449
if (AndIndex == 0 || AndIndex == 1)
43524450
return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
4353-
AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags());
4451+
AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
43544452
}
43554453
}
43564454
}
43574455
}
43584456
}
4359-
return SDValue();
4360-
}
4361-
4362-
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4363-
DAGCombinerInfo &DCI) const {
4364-
SDValue RHS = N->getOperand(1);
4365-
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4366-
EVT VT = N->getValueType(0);
4367-
SDValue LHS = N->getOperand(0);
4368-
SelectionDAG &DAG = DCI.DAG;
4369-
SDLoc SL(N);
4370-
unsigned RHSVal;
4371-
43724457

43734458

43744459
if (CRHS) {

llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,8 @@ define <2 x i64> @ashr_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
112112
; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1
113113
; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
114114
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
115-
; CHECK-NEXT: v_and_b32_e32 v2, 31, v8
116-
; CHECK-NEXT: v_and_b32_e32 v0, 31, v6
117-
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1
118-
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3
115+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1
116+
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3
119117
; CHECK-NEXT: v_mov_b32_e32 v1, v5
120118
; CHECK-NEXT: v_mov_b32_e32 v3, v4
121119
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -147,10 +145,8 @@ define <2 x i64> @ashr_exact_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
147145
; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1
148146
; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
149147
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
150-
; CHECK-NEXT: v_and_b32_e32 v2, 31, v8
151-
; CHECK-NEXT: v_and_b32_e32 v0, 31, v6
152-
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1
153-
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3
148+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1
149+
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3
154150
; CHECK-NEXT: v_mov_b32_e32 v1, v5
155151
; CHECK-NEXT: v_mov_b32_e32 v3, v4
156152
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -394,11 +390,9 @@ define <2 x i64> @ashr_v2_or32(<2 x i64> %arg0, <2 x i64> %shift_amt) {
394390
; CHECK-LABEL: ashr_v2_or32:
395391
; CHECK: ; %bb.0:
396392
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
397-
; CHECK-NEXT: v_and_b32_e32 v2, 31, v6
398-
; CHECK-NEXT: v_and_b32_e32 v0, 31, v4
399-
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1
400-
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3
393+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v1
401394
; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v1
395+
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v3
402396
; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v3
403397
; CHECK-NEXT: s_setpc_b64 s[30:31]
404398
%or = or <2 x i64> %shift_amt, splat (i64 32)
@@ -471,17 +465,13 @@ define <2 x i64> @ashr_v2_or32_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shif
471465
; CHECK-LABEL: ashr_v2_or32_sgpr:
472466
; CHECK: ; %bb.0:
473467
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474-
; CHECK-NEXT: s_mov_b32 s4, 31
475-
; CHECK-NEXT: s_mov_b32 s21, s22
476-
; CHECK-NEXT: s_mov_b32 s5, s4
477-
; CHECK-NEXT: s_and_b64 s[4:5], s[20:21], s[4:5]
478-
; CHECK-NEXT: s_ashr_i32 s6, s17, 31
468+
; CHECK-NEXT: s_ashr_i32 s4, s17, s20
469+
; CHECK-NEXT: s_ashr_i32 s5, s17, 31
470+
; CHECK-NEXT: s_ashr_i32 s6, s19, s22
479471
; CHECK-NEXT: s_ashr_i32 s7, s19, 31
480-
; CHECK-NEXT: s_ashr_i32 s4, s17, s4
481-
; CHECK-NEXT: s_ashr_i32 s5, s19, s5
482472
; CHECK-NEXT: v_mov_b32_e32 v0, s4
483-
; CHECK-NEXT: v_mov_b32_e32 v1, s6
484-
; CHECK-NEXT: v_mov_b32_e32 v2, s5
473+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
474+
; CHECK-NEXT: v_mov_b32_e32 v2, s6
485475
; CHECK-NEXT: v_mov_b32_e32 v3, s7
486476
; CHECK-NEXT: s_setpc_b64 s[30:31]
487477
%or = or <2 x i64> %shift_amt, splat (i64 32)

0 commit comments

Comments
 (0)