Skip to content

Commit 7e9d17a

Browse files
committed
Fix 64-bit ashr scalarisation of and for fold into 32-bit shift
1 parent 247bf50 commit 7e9d17a

File tree

2 files changed

+112
-37
lines changed

2 files changed

+112
-37
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 101 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4222,6 +4222,49 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
42224222
SelectionDAG &DAG = DCI.DAG;
42234223
SDLoc SL(N);
42244224

4225+
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4226+
SDValue VAND = RHS.getOperand(0);
4227+
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
4228+
uint64_t AndIndex = RHS->getConstantOperandVal(1);
4229+
if (VAND->getOpcode() == ISD::AND && CRRHS) {
4230+
SDValue LHSAND = VAND.getOperand(0);
4231+
SDValue RHSAND = VAND.getOperand(1);
4232+
if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
4233+
// Part of sracombine is to optimise for the case where it's possible
4234+
// to reduce sra64 to sra32 if the shift range is [32-63]. This transforms
4235+
// DST = sra i64 X, Y to [sra i32 hi(X), (Y & 31); sra i32 hi(X), 31]. The
4236+
// '&' is then elided by ISel. The vector code for this was being
4237+
// completely scalarised by the vector legalizer, but now that v2i32 is
4238+
// made legal, the vector legalizer only partially scalarises the
4239+
// vector operations and the AND was not elided. This check enables us
4240+
// to locate and scalarise the v2i32 AND and re-enable ISel to elide
4241+
// the AND instruction.
4242+
ConstantSDNode *CANDL =
4243+
dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
4244+
ConstantSDNode *CANDR =
4245+
dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
4246+
if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
4247+
RHSAND->getConstantOperandVal(1) == 0x1f) {
4248+
// Get the non-const AND operands and produce scalar AND
4249+
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4250+
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
4251+
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
4252+
LHSAND, Zero);
4253+
SDValue Hi =
4254+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
4255+
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
4256+
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
4257+
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
4258+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4259+
if (AndIndex == 0 || AndIndex == 1)
4260+
return DAG.getNode(ISD::SRA, SL, MVT::i32, Trunc,
4261+
AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
4262+
}
4263+
}
4264+
}
4265+
}
4266+
}
4267+
42254268
if (VT.getScalarType() != MVT::i64)
42264269
return SDValue();
42274270

@@ -4314,8 +4357,63 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
43144357
return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
43154358
}
43164359

4317-
static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) {
4318-
SDLoc SL = SDLoc(RHS);
4360+
// static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) {
4361+
// SDLoc SL = SDLoc(RHS);
4362+
// if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4363+
// SDValue VAND = RHS.getOperand(0);
4364+
// if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
4365+
// uint64_t AndIndex = RHS->getConstantOperandVal(1);
4366+
// if (VAND->getOpcode() == ISD::AND && CRRHS) {
4367+
// SDValue LHSAND = VAND.getOperand(0);
4368+
// SDValue RHSAND = VAND.getOperand(1);
4369+
// if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
4370+
// // Part of srlcombine is to optimise for the case where its possible
4371+
// // to reduce shl64 to shl32 if shift range is [63-32]. This
4372+
// // transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The
4373+
// // '&' is then elided by ISel. The vector code for this was being
4374+
// // completely scalarised by the vector legalizer, but now v2i32 is
4375+
// // made legal the vector legaliser only partially scalarises the
4376+
// // vector operations and the and was not elided. This check enables us
4377+
// // to locate and scalarise the v2i32 and and re-enable ISel to elide
4378+
// // the and instruction.
4379+
// ConstantSDNode *CANDL =
4380+
// dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
4381+
// ConstantSDNode *CANDR =
4382+
// dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
4383+
// if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
4384+
// RHSAND->getConstantOperandVal(1) == 0x1f) {
4385+
// // Get the non-const AND operands and produce scalar AND
4386+
// const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4387+
// const SDValue One = DAG.getConstant(1, SL, MVT::i32);
4388+
// SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
4389+
// LHSAND, Zero);
4390+
// SDValue Hi =
4391+
// DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
4392+
// SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
4393+
// SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
4394+
// SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
4395+
// SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4396+
// if (AndIndex == 0 || AndIndex == 1)
4397+
// return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
4398+
// AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags());
4399+
// }
4400+
// }
4401+
// }
4402+
// }
4403+
// }
4404+
// return SDValue();
4405+
// }
4406+
4407+
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4408+
DAGCombinerInfo &DCI) const {
4409+
SDValue RHS = N->getOperand(1);
4410+
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4411+
EVT VT = N->getValueType(0);
4412+
SDValue LHS = N->getOperand(0);
4413+
SelectionDAG &DAG = DCI.DAG;
4414+
SDLoc SL(N);
4415+
unsigned RHSVal;
4416+
43194417
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
43204418
SDValue VAND = RHS.getOperand(0);
43214419
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
@@ -4352,25 +4450,12 @@ static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) {
43524450
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
43534451
if (AndIndex == 0 || AndIndex == 1)
43544452
return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
4355-
AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags());
4453+
AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
43564454
}
43574455
}
43584456
}
43594457
}
43604458
}
4361-
return SDValue();
4362-
}
4363-
4364-
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4365-
DAGCombinerInfo &DCI) const {
4366-
SDValue RHS = N->getOperand(1);
4367-
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4368-
EVT VT = N->getValueType(0);
4369-
SDValue LHS = N->getOperand(0);
4370-
SelectionDAG &DAG = DCI.DAG;
4371-
SDLoc SL(N);
4372-
unsigned RHSVal;
4373-
43744459

43754460

43764461
if (CRHS) {

llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,8 @@ define <2 x i64> @ashr_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
112112
; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1
113113
; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
114114
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
115-
; CHECK-NEXT: v_and_b32_e32 v2, 31, v8
116-
; CHECK-NEXT: v_and_b32_e32 v0, 31, v6
117-
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1
118-
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3
115+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1
116+
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3
119117
; CHECK-NEXT: v_mov_b32_e32 v1, v5
120118
; CHECK-NEXT: v_mov_b32_e32 v3, v4
121119
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -147,10 +145,8 @@ define <2 x i64> @ashr_exact_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
147145
; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1
148146
; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
149147
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
150-
; CHECK-NEXT: v_and_b32_e32 v2, 31, v8
151-
; CHECK-NEXT: v_and_b32_e32 v0, 31, v6
152-
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1
153-
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3
148+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1
149+
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3
154150
; CHECK-NEXT: v_mov_b32_e32 v1, v5
155151
; CHECK-NEXT: v_mov_b32_e32 v3, v4
156152
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -394,11 +390,9 @@ define <2 x i64> @ashr_v2_or32(<2 x i64> %arg0, <2 x i64> %shift_amt) {
394390
; CHECK-LABEL: ashr_v2_or32:
395391
; CHECK: ; %bb.0:
396392
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
397-
; CHECK-NEXT: v_and_b32_e32 v2, 31, v6
398-
; CHECK-NEXT: v_and_b32_e32 v0, 31, v4
399-
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1
400-
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3
393+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v1
401394
; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v1
395+
; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v3
402396
; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v3
403397
; CHECK-NEXT: s_setpc_b64 s[30:31]
404398
%or = or <2 x i64> %shift_amt, splat (i64 32)
@@ -471,17 +465,13 @@ define <2 x i64> @ashr_v2_or32_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shif
471465
; CHECK-LABEL: ashr_v2_or32_sgpr:
472466
; CHECK: ; %bb.0:
473467
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474-
; CHECK-NEXT: s_mov_b32 s4, 31
475-
; CHECK-NEXT: s_mov_b32 s21, s22
476-
; CHECK-NEXT: s_mov_b32 s5, s4
477-
; CHECK-NEXT: s_and_b64 s[4:5], s[20:21], s[4:5]
478-
; CHECK-NEXT: s_ashr_i32 s6, s17, 31
468+
; CHECK-NEXT: s_ashr_i32 s4, s17, s20
469+
; CHECK-NEXT: s_ashr_i32 s5, s17, 31
470+
; CHECK-NEXT: s_ashr_i32 s6, s19, s22
479471
; CHECK-NEXT: s_ashr_i32 s7, s19, 31
480-
; CHECK-NEXT: s_ashr_i32 s4, s17, s4
481-
; CHECK-NEXT: s_ashr_i32 s5, s19, s5
482472
; CHECK-NEXT: v_mov_b32_e32 v0, s4
483-
; CHECK-NEXT: v_mov_b32_e32 v1, s6
484-
; CHECK-NEXT: v_mov_b32_e32 v2, s5
473+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
474+
; CHECK-NEXT: v_mov_b32_e32 v2, s6
485475
; CHECK-NEXT: v_mov_b32_e32 v3, s7
486476
; CHECK-NEXT: s_setpc_b64 s[30:31]
487477
%or = or <2 x i64> %shift_amt, splat (i64 32)

0 commit comments

Comments
 (0)