Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 56 additions & 7 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Transforms/Utils/LowerAtomic.h"
Expand Down Expand Up @@ -177,6 +178,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);

// We don't want the default expansion of 16-bit ABS since we can
// sign-extend and use the 32-bit ABS operation for 16-bit ABS with SGPRs
setOperationAction(ISD::ABS, {MVT::i8,MVT::i16}, Custom);
setOperationAction(ISD::SUB, {MVT::i8}, Custom);
}

addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
Expand Down Expand Up @@ -974,7 +980,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
}

setTargetDAGCombine({ISD::ADD,
setTargetDAGCombine({ISD::ABS,
ISD::ADD,
ISD::PTRADD,
ISD::UADDO_CARRY,
ISD::SUB,
Expand Down Expand Up @@ -6710,6 +6717,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::ABS:
return lowerABSi16(Op, DAG);
case ISD::BRCOND:
return LowerBRCOND(Op, DAG);
case ISD::RETURNADDR:
Expand Down Expand Up @@ -6773,7 +6782,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
case ISD::ABS:
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
Expand All @@ -6796,11 +6804,22 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
case ISD::SUB:
if (Op.getValueType() == MVT::i8)
if (isNullConstant(Op.getOperand(0)) &&
Op.getOperand(1).getOpcode() == ISD::ABS)
return DAG.getNode(
ISD::TRUNCATE, SDLoc(Op), MVT::i8,
DAG.getNode(ISD::SUB, SDLoc(Op), MVT::i32,
DAG.getConstant(0, SDLoc(Op), MVT::i32),
lowerABSi16(Op.getOperand(1), DAG).getOperand(0)));
else
Comment on lines +6808 to +6816
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks essentially correct, but it's in the wrong place. This is not mandatory lowering, so this should go in PerformDAGCombine. Doesn't this also need the divergence check?

Copy link
Contributor Author

@linuxrocks123 linuxrocks123 Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's also in performSubCombine. No worky there, either. And yes it's missing divergence check, but adding inhibitions to the optimization wouldn't make it worky.

break;
LLVM_FALLTHROUGH;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::ADD:
case ISD::SUB:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
Expand Down Expand Up @@ -7272,7 +7291,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
switch (N->getOpcode()) {
switch (N->getOpcode()) {
case ISD::INSERT_VECTOR_ELT: {
if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
Results.push_back(Res);
Expand Down Expand Up @@ -8139,6 +8158,25 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}

// sign-extend and use the 32-bit ABS operation for 16-bit ABS with SGPRs
SDValue SITargetLowering::lowerABSi16(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ABS &&
"Tried to select abs with non-abs opcode.");
assert((Op.getValueType() == MVT::i16 || Op.getValueType() == MVT::i8) &&
"Tried to select abs i16 lowering with non-i16 type.");

// divergent means will not end up using SGPRs
if (Op->isDivergent())
return SDValue();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the opposite of what you want:

Suggested change
return SDValue();
return Op;

To get the default expansion, you return the original node. Return SDValue() means treat as legal

Copy link
Contributor Author

@linuxrocks123 linuxrocks123 Nov 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arsenm are you sure about that? See https://github.com/linuxrocks123/llvm-project/blob/97c9dddc96f3576fed0762344ce84b2c48e16671/llvm/lib/Target/AMDGPU/SIISelLowering.cpp#L16891. I don't see how -O0 could be working for us if SDValue() means "treat as legal."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is PerformDAGCombine, which is an optional optimization. This is required lowering. The custom lowering falls back on the default expansion:

if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or it's the other way around


//(abs i16 (i16 op1)) -> (trunc i16 (abs i32 (sext i32 (i16 op1))))
SDValue Src = Op.getOperand(0);
SDLoc DL(Src);
SDValue SExtSrc = DAG.getSExtOrTrunc(Src, DL, MVT::i32);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can slightly generalize this to work on vectors

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does our ISA have a vector equivalent of s_sext_i16 / s_sext_i8?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's v_bfe_i32

SDValue ExtAbs = DAG.getNode(ISD::ABS, DL, MVT::i32, SExtSrc);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), ExtAbs);
}

SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
if (Subtarget->hasApertureRegs()) {
Expand Down Expand Up @@ -16210,13 +16248,20 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
return Folded;
}

if (VT != MVT::i32)
return SDValue();

SDLoc SL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);

if (VT == MVT::i8)
if (isNullConstant(LHS) && RHS->getOpcode() == ISD::ABS)
return DAG.getNode(ISD::TRUNCATE, SL, MVT::i8,
DAG.getNode(ISD::SUB, SL, MVT::i32,
DAG.getConstant(0, SL, MVT::i32),
lowerABSi16(RHS, DAG).getOperand(0)));

if (VT != MVT::i32)
return SDValue();

// sub x, zext (setcc) => usubo_carry x, 0, setcc
// sub x, sext (setcc) => uaddo_carry x, 0, setcc
unsigned Opc = RHS.getOpcode();
Expand Down Expand Up @@ -16855,6 +16900,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();

switch (N->getOpcode()) {
case ISD::ABS:
if (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i8)
return lowerABSi16(SDValue(N,0), DCI.DAG);
break;
case ISD::ADD:
return performAddCombine(N, DCI);
case ISD::PTRADD:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerTrapHsaQueuePtr(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerABSi16(SDValue Op, SelectionDAG &DAG) const;

SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;

Expand Down
Loading
Loading