
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions #140694

Open
wants to merge 22 commits into base: main

Changes from all commits (22 commits)
4cb21c3
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of
chrisjbris May 20, 2025
f0a2cae
Remove over-enthusiastic clang-format
chrisjbris Jun 19, 2025
20fbed5
Respond to some review comments
chrisjbris Jun 23, 2025
3cc5ac1
Add reviewer requested tests
chrisjbris Jun 23, 2025
9b5257e
Suppress over-enthusiastic clang-format
chrisjbris Jun 23, 2025
c4fad1c
Temporarily remove r600 from or.ll test
chrisjbris Jun 23, 2025
257284b
Add SGPR and VGPR tests to and.ll and temporarily remove the r600 run…
chrisjbris Jun 24, 2025
235d6a5
Remove dead check-lines from or.ll
chrisjbris Jun 24, 2025
6046c68
Apply reviewer comments to performFNegCombine
chrisjbris Jun 24, 2025
3697bcb
Remove dead code
chrisjbris Jun 24, 2025
d93e8b2
Reinstate r600 tests in independent files. This action has already t…
chrisjbris Jun 24, 2025
db04694
Remove unhelpful commentary.
chrisjbris Jun 25, 2025
1c7dafc
Remove unnecessary driveby clang-format
chrisjbris Jun 25, 2025
57f8903
Remove dead checks in xor.ll
chrisjbris Jun 25, 2025
45e974f
Remove unnecessary node duplication
chrisjbris Jun 25, 2025
7b31e62
Remove dead xorcombine.
chrisjbris Aug 7, 2025
2561e72
Work to fix regressions in integer select srcmod generation when v2i32
chrisjbris Jul 18, 2025
58f703e
Correct clang-format
chrisjbris Aug 7, 2025
1faff7f
Remove unnecessary code reordering
chrisjbris Aug 7, 2025
c52fdbd
clang-format
chrisjbris Aug 7, 2025
7fff273
clang-format
chrisjbris Aug 7, 2025
701c4e9
Simplify ReplaceSrc()
chrisjbris Aug 12, 2025
34 changes: 28 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3225,29 +3225,51 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
if (IsCanonicalizing)
return true;

unsigned Opc = Src->getOpcode();
// v2i32 xor/or/and are legal. A vselect using these instructions as operands
// is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
// through the extract to the bitwise op.
SDValue PeekSrc =
Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
// Convert various sign-bit masks to src mods. Currently disabled for 16-bit
// types as the codegen replaces the operand without adding a srcmod.
// This is intentionally finding the cases where we are performing float neg
// and abs on int types; the goal is not to obtain two's complement neg or
// abs.
// TODO: Add 16-bit support.
unsigned Opc = PeekSrc.getOpcode();
EVT VT = Src.getValueType();
if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
(VT != MVT::i32 && VT != MVT::i64))
(VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
return true;

ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc ? PeekSrc->getOperand(1)
: Src->getOperand(1));
if (!CRHS)
return true;

auto ReplaceSrc = [&]() -> SDValue {
if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return PeekSrc.getOperand(0);

SDValue LHS = PeekSrc->getOperand(0);
SDValue Index = Src->getOperand(1);
return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
Src.getValueType(), LHS, Index);
};

// Recognise (xor a, 0x80000000) as NEG SrcMod.
// Recognise (and a, 0x7fffffff) as ABS SrcMod.
// Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
Mods |= SISrcMods::NEG;
Src = Src.getOperand(0);
Src = ReplaceSrc();
} else if (Opc == ISD::AND && AllowAbs &&
CRHS->getAPIntValue().isMaxSignedValue()) {
Mods |= SISrcMods::ABS;
Src = Src.getOperand(0);
Src = ReplaceSrc();
} else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
Mods |= SISrcMods::ABS | SISrcMods::NEG;
Src = Src.getOperand(0);
Src = ReplaceSrc();
}

return true;
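For illustration, a hypothetical .ll sketch (function and value names invented, not taken from the PR's tests) of the shape the peek-through targets: a v2i32 sign-bit xor feeding a v2i32 select. The vselect is scalarised into per-lane cndmasks whose operands are extract_vector_elts of the xor, and peeking through the extract lets the xor fold into a NEG source modifier.

; Hypothetical example: the sign-bit xor should become a NEG source modifier
; on the scalarised selects rather than a separate v_xor_b32 per lane.
define <2 x i32> @fneg_style_select_v2i32(<2 x i1> %cc, <2 x i32> %a, <2 x i32> %b) {
  %neg = xor <2 x i32> %a, <i32 -2147483648, i32 -2147483648>
  %sel = select <2 x i1> %cc, <2 x i32> %neg, <2 x i32> %b
  ret <2 x i32> %sel
}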
68 changes: 64 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4048,6 +4048,59 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

// Part of the shift combines is to optimise for the case where it is
// possible to reduce e.g. shl64 to shl32 when the shift amount is known to
// lie in [32, 63]. This transforms DST = shl i64 X, Y into
// [0, shl i32 X, (Y & 31)]; the '&' is then elided by ISel. The vector code
// for this was previously completely scalarised by the vector legalizer,
// but when v2i32 is legal the legalizer only partially scalarises the
// vector operations and the AND is not elided. This function scalarises the
// AND for this optimisation case.
static SDValue getShiftForReduction(unsigned ShiftOpc, SDValue LHS, SDValue RHS,
SelectionDAG &DAG) {
assert(
(ShiftOpc == ISD::SRA || ShiftOpc == ISD::SRL || ShiftOpc == ISD::SHL) &&
"Expected shift Opcode.");

SDLoc SL = SDLoc(RHS);
if (RHS->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();

SDValue VAND = RHS.getOperand(0);
if (VAND->getOpcode() != ISD::AND)
return SDValue();

ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
if (!CRRHS)
return SDValue();

SDValue LHSAND = VAND.getOperand(0);
SDValue RHSAND = VAND.getOperand(1);
if (RHSAND->getOpcode() != ISD::BUILD_VECTOR)
return SDValue();

ConstantSDNode *CANDL = dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
ConstantSDNode *CANDR = dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
if (!CANDL || !CANDR || RHSAND->getConstantOperandVal(0) != 0x1f ||
RHSAND->getConstantOperandVal(1) != 0x1f)
return SDValue();
// Get the non-constant AND operands and produce the scalar ANDs.
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero);
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
uint64_t AndIndex = RHS->getConstantOperandVal(1);
if (AndIndex == 0 || AndIndex == 1)
return DAG.getNode(ShiftOpc, SL, MVT::i32, Trunc,
AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags());

return SDValue();
}
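As a rough illustration (hypothetical IR, not from the PR's tests), this is the kind of input the 64-to-32-bit shift reduction targets: the or with 32 makes every lane's shift amount provably at least 32, so the i64 shifts reduce to i32 shifts, and the leftover v2i32 AND of the amounts with 31 is what this helper scalarises so ISel can elide it.

; Hypothetical example: each lane shifts by at least 32, enabling the
; shl64 -> shl32 reduction on the high halves.
define <2 x i64> @shl_v2i64_reduce(<2 x i64> %arg, <2 x i64> %amt) {
  %biased = or <2 x i64> %amt, <i64 32, i64 32>
  %shifted = shl <2 x i64> %arg, %biased
  ret <2 x i64> %shifted
}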

SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
@@ -4057,6 +4110,9 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;

if (SDValue SS = getShiftForReduction(ISD::SHL, LHS, RHS, DAG))
return SS;

unsigned RHSVal;
if (CRHS) {
RHSVal = CRHS->getZExtValue();
@@ -4098,8 +4154,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (VT.getScalarType() != MVT::i64)
return SDValue();

// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
// the same code size.
@@ -4159,6 +4213,9 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);

if (SDValue SS = getShiftForReduction(ISD::SRA, LHS, RHS, DAG))
return SS;

if (VT.getScalarType() != MVT::i64)
return SDValue();

@@ -4189,12 +4246,12 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
(ElementType.getSizeInBits() - 1)) {
ShiftAmt = ShiftFullAmt;
} else {
SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
const SDValue ShiftMask =
DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
// This AND instruction will clamp out of bounds shift values.
// It will also be removed during later instruction selection.
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
}

EVT ConcatType;
@@ -4261,6 +4318,9 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
SDLoc SL(N);
unsigned RHSVal;

if (SDValue SS = getShiftForReduction(ISD::SRL, LHS, RHS, DAG))
return SS;

if (CRHS) {
RHSVal = CRHS->getZExtValue();

84 changes: 83 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
}

setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
// instead lower it to cndmask in SITargetLowering::LowerSELECT().
setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
// alignbit.
setOperationAction(ISD::ROTR, MVT::v2i32, Custom);

setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);

@@ -6183,6 +6191,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

// Enable lowering of ROTR for vxi32 types. This is a workaround for a
// regression, caused by legalising v2i32 or, whereby unnecessary extra
// instructions were added to the codegen for rotr operations in order to
// extract the result from the vector.
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
[[maybe_unused]] EVT VT = Op.getValueType();

assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
VT == MVT::v16i32) &&
"Unexpected ValueType.");

return DAG.UnrollVectorOp(Op.getNode());
}
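A minimal hypothetical rotate written with the funnel-shift intrinsic (names invented): MatchRotate recognises the equal-operand fshr as ISD::ROTR, and lowerROTR unrolls the vector so each lane can still select to v_alignbit_b32.

declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)

; Hypothetical example: a per-lane rotate right, expected to remain two
; alignbit instructions rather than gaining extra vector extracts.
define <2 x i32> @rotr_v2i32(<2 x i32> %x, <2 x i32> %amt) {
  %rot = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %amt)
  ret <2 x i32> %rot
}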

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6374,6 +6396,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerGET_FPENV(Op, DAG);
case ISD::SET_FPENV:
return lowerSET_FPENV(Op, DAG);
case ISD::ROTR:
return lowerROTR(Op, DAG);
}
return SDValue();
}
@@ -13412,6 +13436,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}

// Detect an identity v2i32 OR and replace it with the identity source node.
// Specifically, an OR whose operands are constructed from the same source
// node via extract_vector_elt and build_vector, i.e.
// v2i32 or(
// v2i32 build_vector(
// i32 extract_elt(%IdentitySrc, 0),
// i32 0
// ),
// v2i32 build_vector(
// i32 0,
// i32 extract_elt(%IdentitySrc, 1)
// ) )
// =>
// v2i32 %IdentitySrc

if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
RHS->getOpcode() == ISD::BUILD_VECTOR) {

ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));

// Check that the constant elements of both build_vectors are zero.
if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {

// Get the extract_vector_element operands.
SDValue LEVE = LHS->getOperand(0);
SDValue REVE = RHS->getOperand(1);

if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
// Check that different elements from the same vector are
// extracted.
if (LEVE->getOperand(0) == REVE->getOperand(0) &&
LEVE->getOperand(1) != REVE->getOperand(1)) {
SDValue IdentitySrc = LEVE.getOperand(0);
return IdentitySrc;
}
}
}
}

if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
return SDValue();

@@ -13459,7 +13524,7 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);

const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
SelectionDAG &DAG = DCI.DAG;

EVT VT = N->getValueType(0);
@@ -13469,6 +13534,23 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return Split;
}

// v2i32 (xor (vselect cc, x, y), K) ->
// (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
// replaced with source modifiers when the select is lowered to CNDMASK.
unsigned Opc = LHS.getOpcode();
if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
(Opc == ISD::SELECT && VT == MVT::i64)) &&
CRHS && CRHS->getAPIntValue().isSignMask()) {
SDValue CC = LHS->getOperand(0);
SDValue TRUE = LHS->getOperand(1);
SDValue FALSE = LHS->getOperand(2);
SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
SDValue XSelect =
DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
return XSelect;
}
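A hypothetical sketch of IR that reaches this combine (names invented): the sign-bit xor of a v2i32 select is pushed into both arms so that, once the vselect becomes CNDMASK, the xors can be absorbed as NEG source modifiers.

; Hypothetical example of the v2i32 (xor (vselect ...), signmask) shape.
define <2 x i32> @xor_signmask_of_select(<2 x i1> %cc, <2 x i32> %x, <2 x i32> %y) {
  %sel = select <2 x i1> %cc, <2 x i32> %x, <2 x i32> %y
  %neg = xor <2 x i32> %sel, <i32 -2147483648, i32 -2147483648>
  ret <2 x i32> %neg
}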

// Make sure to apply the 64-bit constant splitting fold before trying to fold
// fneg-like xors into 64-bit select.
if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -444,6 +444,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;

Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2467,9 +2467,9 @@ def : AMDGPUPatIgnoreCopies <
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;

// 64-bit version
foreach vt = [i64, v2i32] in {
def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
(DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2478,6 +2478,7 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
}
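A hypothetical divergent v2i32 instance of the z ^ (x & (y ^ z)) bitfield-insert shape that the extended foreach pattern now covers (assuming the arguments are divergent); it should select to one V_BFI_B32 per lane.

; Hypothetical example: per-lane bitfield insert, z ^ (x & (y ^ z)).
define <2 x i32> @bfi_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
  %inner = xor <2 x i32> %y, %z
  %masked = and <2 x i32> %x, %inner
  %bfi = xor <2 x i32> %z, %masked
  ret <2 x i32> %bfi
}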

def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1841,6 +1841,21 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
(S_AND_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
(S_OR_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
(S_XOR_B64 SReg_64:$x, SReg_64:$y)
>;
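A hypothetical uniform case these patterns cover (names invented): with both operands held in SGPRs via inreg, UniformBinFrag matches and the v2i32 AND can select to a single S_AND_B64 instead of two S_AND_B32.

; Hypothetical example: a uniform v2i32 AND of two inreg (SGPR) values.
define amdgpu_ps void @uniform_and_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b, ptr addrspace(1) %out) {
  %r = and <2 x i32> %a, %b
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}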

// Same as a 32-bit inreg
def : GCNPat<
(i32 (UniformUnaryFrag<sext> i16:$src)),
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1026,9 +1026,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
def : DivergentBinOp<sube, V_SUBB_U32_e32>;

class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
GCNPat<
(DivergentBinFrag<Op> i64:$src0, i64:$src1),
(DivergentBinFrag<Op> vt:$src0, vt:$src1),
(REG_SEQUENCE VReg_64,
(Inst
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -1045,6 +1045,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;

def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
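A hypothetical divergent counterpart (names invented): the loaded value varies per lane, so DivergentBinFrag matches and the v2i32 xor splits into two V_XOR_B32 over the 64-bit register's sub-registers.

declare i32 @llvm.amdgcn.workitem.id.x()

; Hypothetical example: a divergent v2i32 XOR, one v_xor_b32 per component.
define amdgpu_kernel void @divergent_xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
  %val = load <2 x i32>, ptr addrspace(1) %gep
  %res = xor <2 x i32> %val, <i32 1, i32 2>
  store <2 x i32> %res, ptr addrspace(1) %out
  ret void
}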

// mul24 w/ 64 bit output.
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
(i64 (Op i32:$src0, i32:$src1)),