Merged

36 commits
c4dcb8c
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of
chrisjbris May 20, 2025
ce16329
Remove over-enthusiastic clang-format
chrisjbris Jun 19, 2025
a54235d
Respond to some review comments
chrisjbris Jun 23, 2025
45da1d1
Add reviewer requested tests
chrisjbris Jun 23, 2025
c3cb854
Suppress over-enthusiastic clang-format
chrisjbris Jun 23, 2025
ad83e03
Temporarily remove r600 from or.ll test
chrisjbris Jun 23, 2025
9ce5bdd
Add SGPR and VGPR tests to and.ll and temporarily remove the r600 run…
chrisjbris Jun 24, 2025
0fc0980
Remove dead check-lines from or.ll
chrisjbris Jun 24, 2025
5a6e6e4
Apply reviewer comments to performFNegCombine
chrisjbris Jun 24, 2025
f2fd7d2
Remove dead code
chrisjbris Jun 24, 2025
4f19d19
Re-enstate r600 tests in independent files. This action has already t…
chrisjbris Jun 24, 2025
5f3e156
Remove unhelpful commentary.
chrisjbris Jun 25, 2025
3e0fe34
Remove unnecessary driveby clang-format
chrisjbris Jun 25, 2025
89ea0ae
Remove dead checks in xor.ll
chrisjbris Jun 25, 2025
75c72c8
Remove unnnecessary node duplication
chrisjbris Jun 25, 2025
57dafa9
Remove dead xorcombine.
chrisjbris Aug 7, 2025
0a05bba
Work to fix regressions in integer select srcmod generation when v2i32
chrisjbris Jul 18, 2025
f10e16f
Correct clang-format
chrisjbris Aug 7, 2025
51e44b8
Remove unnecessary code reordering
chrisjbris Aug 7, 2025
a0b2a91
clang-format
chrisjbris Aug 7, 2025
fe8b60e
clang-format
chrisjbris Aug 7, 2025
309e2f9
Simplify ReplaceSrc()
chrisjbris Aug 12, 2025
7815116
Remove dead ternary expression from AMDGPUDAGToDAG.
chrisjbris Aug 14, 2025
bc4813a
Change unnecessary use of PeekSrc to Src in ReplaceSrc().
chrisjbris Aug 14, 2025
5791b10
Update comment for v2i32 case.
chrisjbris Aug 14, 2025
82e51ed
Apply reviewer corrections to getShiftForReduction()
chrisjbris Aug 15, 2025
b4fcd59
Update getShiftForReduction() to type check and add comment.
chrisjbris Aug 15, 2025
a8668b7
Remove dead truncate, correct flags() call and add detail to comment.
chrisjbris Sep 5, 2025
151460d
Correct comment.
chrisjbris Sep 5, 2025
90fa0e7
Updated test.
chrisjbris Sep 5, 2025
81cf80c
Replace getShiftForeReduction with an additional combine in
chrisjbris Sep 7, 2025
5736525
Correct formatting.
chrisjbris Sep 7, 2025
1943ddc
Remove deprecated getShiftForRefuction() function
chrisjbris Sep 7, 2025
4fc323e
Add TODO to performExtractVectorEltCombine()
chrisjbris Sep 8, 2025
c9c5967
Simplify combine with isConstorConstSplat().
chrisjbris Sep 9, 2025
deb29c8
Further simplify extract_vector_elt combine
chrisjbris Sep 9, 2025
41 changes: 32 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3302,29 +3302,52 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
if (IsCanonicalizing)
return true;

unsigned Opc = Src->getOpcode();
// v2i32 xor/or/and are legal. A vselect using these instructions as operands
// is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
// through the extract to the bitwise op.
SDValue PeekSrc =
Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
// Convert various sign-bit masks to src mods. Currently disabled for 16-bit
// types as the codegen replaces the operand without adding a srcmod.
// This is intentionally finding the cases where we are performing float neg
// and abs on int types; the goal is not to obtain two's complement neg or
// abs.
// TODO: Add 16-bit support.
unsigned Opc = PeekSrc.getOpcode();
EVT VT = Src.getValueType();
if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
(VT != MVT::i32 && VT != MVT::i64))
(VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
return true;

ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
if (!CRHS)
return true;

// Recognise (xor a, 0x80000000) as NEG SrcMod.
// Recognise (and a, 0x7fffffff) as ABS SrcMod.
// Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
auto ReplaceSrc = [&]() -> SDValue {
if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return Src.getOperand(0);

SDValue LHS = PeekSrc->getOperand(0);
SDValue Index = Src->getOperand(1);
return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
Src.getValueType(), LHS, Index);
};

// Recognise Srcmods:
// (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
// (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
// (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
// SrcModifiers.
if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
Mods |= SISrcMods::NEG;
Src = Src.getOperand(0);
Src = ReplaceSrc();
} else if (Opc == ISD::AND && AllowAbs &&
CRHS->getAPIntValue().isMaxSignedValue()) {
Mods |= SISrcMods::ABS;
Src = Src.getOperand(0);
Src = ReplaceSrc();
} else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
Mods |= SISrcMods::ABS | SISrcMods::NEG;
Src = Src.getOperand(0);
Src = ReplaceSrc();
}

return true;
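For illustration, a minimal LLVM IR sketch of the kind of pattern the updated SelectVOP3ModsImpl is meant to catch: a v2i32 xor with a sign-bit splat whose lanes reach a float op through extract_vector_elt once the vector FP op is scalarised. The function name and exact shape are assumptions, not taken from the PR's tests; the intent is that the xor folds into a NEG source modifier rather than a separate v_xor_b32.

; Assumed shape; the PR's actual regression tests may differ.
define <2 x float> @fneg_via_v2i32_xor(<2 x float> %x, <2 x float> %y) {
  %bc = bitcast <2 x float> %x to <2 x i32>
  %neg = xor <2 x i32> %bc, <i32 -2147483648, i32 -2147483648>
  %negf = bitcast <2 x i32> %neg to <2 x float>
  %r = fadd <2 x float> %negf, %y
  ret <2 x float> %r
}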
6 changes: 2 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4119,8 +4119,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (VT.getScalarType() != MVT::i64)
return SDValue();

// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
// the same code size.
@@ -4210,12 +4208,12 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
(ElementType.getSizeInBits() - 1)) {
ShiftAmt = ShiftFullAmt;
} else {
SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
const SDValue ShiftMask =
DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
// This AND instruction will clamp out of bounds shift values.
// It will also be removed during later instruction selection.
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
}

EVT ConcatType;
105 changes: 104 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
}

setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
// instead lower to cndmask in SITargetLowering::LowerSELECT().
setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
// alignbit.
setOperationAction(ISD::ROTR, MVT::v2i32, Custom);

setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);

@@ -6272,6 +6280,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

// Enable lowering of ROTR for vxi32 types. This is a workaround for a
// regression in rotr codegen, caused by legalising v2i32 or, in which
// unnecessary extra instructions were emitted to extract the result from the
// vector.
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
[[maybe_unused]] EVT VT = Op.getValueType();

assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
VT == MVT::v16i32) &&
"Unexpected ValueType.");

return DAG.UnrollVectorOp(Op.getNode());
}

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6463,6 +6485,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerGET_FPENV(Op, DAG);
case ISD::SET_FPENV:
return lowerSET_FPENV(Op, DAG);
case ISD::ROTR:
return lowerROTR(Op, DAG);
}
return SDValue();
}
@@ -13545,6 +13569,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}

// Detect an identity v2i32 OR and replace it with the identity source node.
// Specifically, an OR whose operands are constructed from the same source
// node via extract_vector_elt and build_vector, i.e.:
// v2i32 or(
// v2i32 build_vector(
// i32 extract_elt(%IdentitySrc, 0),
// i32 0
// ),
// v2i32 build_vector(
// i32 0,
// i32 extract_elt(%IdentitySrc, 1)
// ) )
// =>
// v2i32 %IdentitySrc

if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
RHS->getOpcode() == ISD::BUILD_VECTOR) {

ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));

// Check that the constant lanes of both build_vectors are zero.
if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {

// Get the extract_vector_element operands.
SDValue LEVE = LHS->getOperand(0);
SDValue REVE = RHS->getOperand(1);

if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
// Check that different elements from the same vector are
// extracted.
if (LEVE->getOperand(0) == REVE->getOperand(0) &&
LEVE->getOperand(1) != REVE->getOperand(1)) {
SDValue IdentitySrc = LEVE.getOperand(0);
return IdentitySrc;
}
}
}
}

if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
return SDValue();

@@ -13592,7 +13657,7 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);

const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
SelectionDAG &DAG = DCI.DAG;

EVT VT = N->getValueType(0);
Expand All @@ -13602,6 +13667,23 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return Split;
}

// v2i32 (xor (vselect cc, x, y), K) ->
// (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
// replaced with source modifiers when the select is lowered to CNDMASK.
unsigned Opc = LHS.getOpcode();
if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
(Opc == ISD::SELECT && VT == MVT::i64)) &&
CRHS && CRHS->getAPIntValue().isSignMask()) {
SDValue CC = LHS->getOperand(0);
SDValue TRUE = LHS->getOperand(1);
SDValue FALSE = LHS->getOperand(2);
SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
SDValue XSelect =
DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
return XSelect;
}

// Make sure to apply the 64-bit constant splitting fold before trying to fold
// fneg-like xors into 64-bit select.
if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
@@ -14592,6 +14674,27 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
}

// (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
// -> (and (extract_vector_element {y0, y1}, index), 0x1f)
// There are optimisations to transform 64-bit shifts into 32-bit shifts
// depending on the shift operand. See e.g. performSraCombine().
// This combine ensures that the optimisation is compatible with v2i32
// legalised AND.
if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {

const ConstantSDNode *C = isConstOrConstSplat(Vec.getOperand(1));
if (!C || C->getZExtValue() != 0x1f)
return SDValue();

SDLoc SL(N);
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
Vec->getOperand(0), N->getOperand(1));
SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
DAG.ReplaceAllUsesWith(N, A.getNode());
}

// ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
// =>
// Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
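A rough sketch of the rotr case that lowerROTR addresses, written with the generic funnel-shift intrinsic (the function name is made up; the PR's own rotr tests are not shown in this diff). With ROTR marked Custom for v2i32, DAGCombiner's MatchRotate can still form ISD::ROTR, and lowerROTR unrolls it so each lane selects to v_alignbit_b32 without extra extract/insert traffic.

; Assumed test shape, not copied from the PR.
define <2 x i32> @rotr_v2i32(<2 x i32> %x, <2 x i32> %amt) {
  %r = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %amt)
  ret <2 x i32> %r
}
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)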
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -444,6 +444,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;

Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2585,9 +2585,9 @@ def : AMDGPUPatIgnoreCopies <
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;

// 64-bit version
foreach vt = [i64, v2i32] in {
def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
(DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2596,6 +2596,7 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
}

def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
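As a hedged illustration of the bfi idiom that the extended AMDGPUPatIgnoreCopies pattern now also matches for v2i32 (the function name is an assumption), the IR below computes (xor z, (and x, (xor y, z))), which is expected to select to a pair of V_BFI_B32_e64 over the two sub-registers:

; Assumed shape for the divergent (VGPR) case.
define <2 x i32> @bfi_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
  %xor1 = xor <2 x i32> %y, %z
  %and = and <2 x i32> %x, %xor1
  %r = xor <2 x i32> %z, %and
  ret <2 x i32> %r
}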
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1882,6 +1882,21 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
(S_AND_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
(S_OR_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
(S_XOR_B64 SReg_64:$x, SReg_64:$y)
>;

// Same as a 32-bit inreg
def : GCNPat<
(i32 (UniformUnaryFrag<sext> i16:$src)),
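A hedged example of what the new UniformBinFrag patterns are expected to select: with uniform <2 x i32> operands (kernel arguments live in SGPRs), a single s_and_b64 / s_or_b64 / s_xor_b64 per operation should be emitted rather than two 32-bit ops. The kernel below is an assumed test shape, not copied from the PR.

define amdgpu_kernel void @s_and_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) {
  %r = and <2 x i32> %a, %b
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}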
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1024,9 +1024,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
def : DivergentBinOp<sube, V_SUBB_U32_e32>;

class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
GCNPat<
(DivergentBinFrag<Op> i64:$src0, i64:$src1),
(DivergentBinFrag<Op> vt:$src0, vt:$src1),
(REG_SEQUENCE VReg_64,
(Inst
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -1043,6 +1043,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;

def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;

// mul24 w/ 64 bit output.
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
(i64 (Op i32:$src0, i32:$src1)),
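For the divergent side, a comparable sketch (again an assumed test shape): arguments of a non-kernel function arrive in VGPRs and are treated as divergent, so the divergent_i64_BinOp patterns instantiated for v2i32 should split the operation into two V_XOR_B32_e64 (or V_AND_B32_e64 / V_OR_B32_e64) instructions over the sub-registers.

define <2 x i32> @v_xor_v2i32(<2 x i32> %a, <2 x i32> %b) {
  %r = xor <2 x i32> %a, %b
  ret <2 x i32> %r
}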