Skip to content

Commit 5e6564b

Browse files
authored
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions (#140694)
- Enable s_or_b64/s_and_b64/s_xor_b64 for v2i32. Add various additional combines to make use of these newly legalised instructions. - Update several tests and separate legacy r600 tests where necessary.
1 parent 3eedaa8 commit 5e6564b

20 files changed

+2036
-1299
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3302,29 +3302,52 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
33023302
if (IsCanonicalizing)
33033303
return true;
33043304

3305-
unsigned Opc = Src->getOpcode();
3305+
// v2i32 xor/or/and are legal. A vselect using these instructions as operands
3306+
// is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3307+
// through the extract to the bitwise op.
3308+
SDValue PeekSrc =
3309+
Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3310+
// Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3311+
// types as the codegen replaces the operand without adding a srcmod.
3312+
// This is intentionally finding the cases where we are performing float neg
3313+
// and abs on int types, the goal is not to obtain two's complement neg or
3314+
// abs.
3315+
// TODO: Add 16-bit support.
3316+
unsigned Opc = PeekSrc.getOpcode();
33063317
EVT VT = Src.getValueType();
33073318
if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3308-
(VT != MVT::i32 && VT != MVT::i64))
3319+
(VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
33093320
return true;
33103321

3311-
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
3322+
ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
33123323
if (!CRHS)
33133324
return true;
33143325

3315-
// Recognise (xor a, 0x80000000) as NEG SrcMod.
3316-
// Recognise (and a, 0x7fffffff) as ABS SrcMod.
3317-
// Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
3326+
auto ReplaceSrc = [&]() -> SDValue {
3327+
if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3328+
return Src.getOperand(0);
3329+
3330+
SDValue LHS = PeekSrc->getOperand(0);
3331+
SDValue Index = Src->getOperand(1);
3332+
return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3333+
Src.getValueType(), LHS, Index);
3334+
};
3335+
3336+
// Recognise Srcmods:
3337+
// (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3338+
// (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3339+
// (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3340+
// SrcModifiers.
33183341
if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
33193342
Mods |= SISrcMods::NEG;
3320-
Src = Src.getOperand(0);
3343+
Src = ReplaceSrc();
33213344
} else if (Opc == ISD::AND && AllowAbs &&
33223345
CRHS->getAPIntValue().isMaxSignedValue()) {
33233346
Mods |= SISrcMods::ABS;
3324-
Src = Src.getOperand(0);
3347+
Src = ReplaceSrc();
33253348
} else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
33263349
Mods |= SISrcMods::ABS | SISrcMods::NEG;
3327-
Src = Src.getOperand(0);
3350+
Src = ReplaceSrc();
33283351
}
33293352

33303353
return true;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4119,8 +4119,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
41194119
if (VT.getScalarType() != MVT::i64)
41204120
return SDValue();
41214121

4122-
// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4123-
41244122
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
41254123
// common case, splitting this into a move and a 32-bit shift is faster and
41264124
// the same code size.
@@ -4210,12 +4208,12 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
42104208
(ElementType.getSizeInBits() - 1)) {
42114209
ShiftAmt = ShiftFullAmt;
42124210
} else {
4213-
SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4211+
SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
42144212
const SDValue ShiftMask =
42154213
DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
42164214
// This AND instruction will clamp out of bounds shift values.
42174215
// It will also be removed during later instruction selection.
4218-
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
4216+
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
42194217
}
42204218

42214219
EVT ConcatType;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
440440
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441441
}
442442

443+
setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
444+
// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
445+
// instead lower to cndmask in SITargetLowering::LowerSELECT().
446+
setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
447+
// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
448+
// alignbit.
449+
setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
450+
443451
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
444452
Custom);
445453

@@ -6528,6 +6536,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
65286536
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
65296537
}
65306538

6539+
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6540+
// regression whereby extra unnecessary instructions were added to codegen
6541+
// for rotr operations, casued by legalising v2i32 or. This resulted in extra
6542+
// instructions to extract the result from the vector.
6543+
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6544+
[[maybe_unused]] EVT VT = Op.getValueType();
6545+
6546+
assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6547+
VT == MVT::v16i32) &&
6548+
"Unexpected ValueType.");
6549+
6550+
return DAG.UnrollVectorOp(Op.getNode());
6551+
}
6552+
65316553
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
65326554
// wider vector type is legal.
65336555
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6719,6 +6741,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
67196741
return lowerGET_FPENV(Op, DAG);
67206742
case ISD::SET_FPENV:
67216743
return lowerSET_FPENV(Op, DAG);
6744+
case ISD::ROTR:
6745+
return lowerROTR(Op, DAG);
67226746
}
67236747
return SDValue();
67246748
}
@@ -13801,6 +13825,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1380113825
}
1380213826
}
1380313827

13828+
// Detect identity v2i32 OR and replace with identity source node.
13829+
// Specifically an Or that has operands constructed from the same source node
13830+
// via extract_vector_elt and build_vector. I.E.
13831+
// v2i32 or(
13832+
// v2i32 build_vector(
13833+
// i32 extract_elt(%IdentitySrc, 0),
13834+
// i32 0
13835+
// ),
13836+
// v2i32 build_vector(
13837+
// i32 0,
13838+
// i32 extract_elt(%IdentitySrc, 1)
13839+
// ) )
13840+
// =>
13841+
// v2i32 %IdentitySrc
13842+
13843+
if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
13844+
RHS->getOpcode() == ISD::BUILD_VECTOR) {
13845+
13846+
ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13847+
ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
13848+
13849+
// Test for and normalise build vectors.
13850+
if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
13851+
13852+
// Get the extract_vector_element operands.
13853+
SDValue LEVE = LHS->getOperand(0);
13854+
SDValue REVE = RHS->getOperand(1);
13855+
13856+
if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
13857+
REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13858+
// Check that different elements from the same vector are
13859+
// extracted.
13860+
if (LEVE->getOperand(0) == REVE->getOperand(0) &&
13861+
LEVE->getOperand(1) != REVE->getOperand(1)) {
13862+
SDValue IdentitySrc = LEVE.getOperand(0);
13863+
return IdentitySrc;
13864+
}
13865+
}
13866+
}
13867+
}
13868+
1380413869
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
1380513870
return SDValue();
1380613871

@@ -13848,7 +13913,7 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1384813913
SDValue LHS = N->getOperand(0);
1384913914
SDValue RHS = N->getOperand(1);
1385013915

13851-
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13916+
const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
1385213917
SelectionDAG &DAG = DCI.DAG;
1385313918

1385413919
EVT VT = N->getValueType(0);
@@ -13858,6 +13923,23 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1385813923
return Split;
1385913924
}
1386013925

13926+
// v2i32 (xor (vselect cc, x, y), K) ->
13927+
// (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
13928+
// replaced with source modifiers when the select is lowered to CNDMASK.
13929+
unsigned Opc = LHS.getOpcode();
13930+
if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
13931+
(Opc == ISD::SELECT && VT == MVT::i64)) &&
13932+
CRHS && CRHS->getAPIntValue().isSignMask()) {
13933+
SDValue CC = LHS->getOperand(0);
13934+
SDValue TRUE = LHS->getOperand(1);
13935+
SDValue FALSE = LHS->getOperand(2);
13936+
SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
13937+
SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
13938+
SDValue XSelect =
13939+
DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
13940+
return XSelect;
13941+
}
13942+
1386113943
// Make sure to apply the 64-bit constant splitting fold before trying to fold
1386213944
// fneg-like xors into 64-bit select.
1386313945
if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
@@ -14848,6 +14930,27 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
1484814930
return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
1484914931
}
1485014932

14933+
// (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
14934+
// -> (and (extract_vector_element {y0, y1}, index), 0x1f)
14935+
// There are optimisations to transform 64-bit shifts into 32-bit shifts
14936+
// depending on the shift operand. See e.g. performSraCombine().
14937+
// This combine ensures that the optimisation is compatible with v2i32
14938+
// legalised AND.
14939+
if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
14940+
Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
14941+
14942+
const ConstantSDNode *C = isConstOrConstSplat(Vec.getOperand(1));
14943+
if (!C || C->getZExtValue() != 0x1f)
14944+
return SDValue();
14945+
14946+
SDLoc SL(N);
14947+
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
14948+
SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
14949+
Vec->getOperand(0), N->getOperand(1));
14950+
SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
14951+
DAG.ReplaceAllUsesWith(N, A.getNode());
14952+
}
14953+
1485114954
// ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
1485214955
// =>
1485314956
// Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
444444
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
445445
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
446446
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
447+
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
447448

448449
Register getRegisterByName(const char* RegName, LLT VT,
449450
const MachineFunction &MF) const override;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2614,9 +2614,9 @@ def : AMDGPUPatIgnoreCopies <
26142614
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
26152615
>;
26162616

2617-
// 64-bit version
2617+
foreach vt = [i64, v2i32] in {
26182618
def : AMDGPUPatIgnoreCopies <
2619-
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
2619+
(DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
26202620
(REG_SEQUENCE VReg_64,
26212621
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
26222622
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2625,6 +2625,7 @@ def : AMDGPUPatIgnoreCopies <
26252625
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
26262626
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
26272627
>;
2628+
}
26282629

26292630
def : AMDGPUPat <
26302631
(fcopysign f32:$src0, f32:$src1),

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1887,6 +1887,21 @@ def : GCNPat <
18871887
(S_MOV_B32 imm:$imm)
18881888
>;
18891889

1890+
// Select the 64-bit wide SALU bitwise instructions for uniform v2i32
// and/or/xor, treating the two 32-bit lanes as one 64-bit register pair.
def : GCNPat <
  (v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
  (S_AND_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
  (v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
  (S_OR_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
  (v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
  (S_XOR_B64 SReg_64:$x, SReg_64:$y)
>;
1904+
18901905
// Same as a 32-bit inreg
18911906
def : GCNPat<
18921907
(i32 (UniformUnaryFrag<sext> i16:$src)),

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1024,9 +1024,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
10241024
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
10251025
def : DivergentBinOp<sube, V_SUBB_U32_e32>;
10261026

1027-
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
1027+
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
10281028
GCNPat<
1029-
(DivergentBinFrag<Op> i64:$src0, i64:$src1),
1029+
(DivergentBinFrag<Op> vt:$src0, vt:$src1),
10301030
(REG_SEQUENCE VReg_64,
10311031
(Inst
10321032
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -1043,6 +1043,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
10431043
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
10441044
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
10451045

1046+
// Divergent v2i32 and/or/xor: reuse the parameterised 64-bit pattern class so
// each op is selected as two 32-bit VALU instructions, one per lane.
def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
1049+
10461050
// mul24 w/ 64 bit output.
10471051
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
10481052
(i64 (Op i32:$src0, i32:$src1)),

0 commit comments

Comments
 (0)