Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
aa4e757
[AMDGPU] add support for i64 readlane
vikramRH Apr 10, 2024
b895dd5
add support for i64 readfirstlane and writelane intrinsics
vikramRH Apr 12, 2024
dfa3219
Fix issues with writelane expansion
vikramRH Apr 15, 2024
fcc0a1a
code refactor and add patterns for f64
vikramRH Apr 18, 2024
4e71a06
clang format
vikramRH Apr 18, 2024
c7ff0e5
fix corner case with regkill and add readlane tests
vikramRH Apr 18, 2024
d6a8ce4
update builtin handling for readlane and readfirstlane
vikramRH Apr 19, 2024
15cbd90
add and update tests, fixes to writelane src0 imm handling
vikramRH Apr 19, 2024
776a4c6
address review comments
Apr 22, 2024
82da530
Implement lowering in legalizer for legal types
vikramRH May 2, 2024
14fcf44
refactor/improve GIsel lowering, added new tests
vikramRH May 6, 2024
d0610c4
Review comments, refactor GISel Impl
vikramRH May 9, 2024
9233833
clang-format
vikramRH May 9, 2024
5feef44
Merge branch 'main' into rw_lane_64
vikramRH May 13, 2024
993a630
Review comments, improve pointer handling with GISel
vikramRH May 13, 2024
556dda2
align comments
vikramRH May 13, 2024
b59873e
Review comments
vikramRH May 15, 2024
edd3179
fix type profile
vikramRH May 16, 2024
a75eb6b
remove spurious comma
vikramRH May 17, 2024
52d7020
review comments, move pointer tests to new files
vikramRH May 18, 2024
66ca57c
remove bitcasts, avoid special handling of pointers in gisel
vikramRH May 23, 2024
c3e512c
Review comments, updated AMDGPUUsage.rst
vikramRH May 27, 2024
72af37c
preserve legal 32-bit pieces, update usage doc
vikramRH May 30, 2024
2e4c5bc
Refactor GIsel lowering
vikramRH May 30, 2024
67e19e5
fix documentation mess
vikramRH May 30, 2024
cba2b1d
review comments
vikramRH May 30, 2024
26223c8
handle comment
vikramRH May 31, 2024
429fb0f
Review comments
vikramRH May 31, 2024
ec7b5c1
test for convergence related crash
vikramRH Jun 3, 2024
3d9cf2e
Update convergence-laneops-xfail.ll
vikramRH Jun 3, 2024
c015040
Merge branch 'main' into rw_lane_64
vikramRH Jun 6, 2024
482f380
update convergence related failure tests
vikramRH Jun 12, 2024
2b4cabb
Merge branch 'main' into rw_lane_64
vikramRH Jun 12, 2024
1a33cbc
revert target-specific SDNodes, handle convergence tokens in SDAG
vikramRH Jun 14, 2024
cfa659d
remove spurious new lines
vikramRH Jun 14, 2024
be90ba6
review comments
vikramRH Jun 17, 2024
104121f
Merge branch 'main' into rw_lane_64
vikramRH Jun 22, 2024
31b8838
update builtin CodeGen
vikramRH Jun 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -308,14 +308,14 @@ void test_ds_bpermute(global int* out, int a, int b)
}

// CHECK-LABEL: @test_readfirstlane
// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}

// CHECK-LABEL: @test_readlane
// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
Expand Down
6 changes: 0 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -904,9 +904,6 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
return false;
case AMDGPUISD::SETCC: // ballot-style instruction
return true;
case AMDGPUISD::READFIRSTLANE:
case AMDGPUISD::READLANE:
return true;
}
return false;
}
Expand Down Expand Up @@ -5511,9 +5508,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LDS)
NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
NODE_NAME_CASE(READLANE)
NODE_NAME_CASE(READFIRSTLANE)
NODE_NAME_CASE(WRITELANE)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
Expand Down
4 changes: 0 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,10 +558,6 @@ enum NodeType : unsigned {
FPTRUNC_ROUND_UPWARD,
FPTRUNC_ROUND_DOWNWARD,

READLANE,
READFIRSTLANE,
WRITELANE,

DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
Expand Down
27 changes: 0 additions & 27 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -342,22 +342,6 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",

def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;

def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [
SDTCisSameAs<0, 1>
]>;

def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [
SDTCisSameAs<0, 1>, SDTCisInt<2>
]>;

def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
]>;

def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp, [SDNPOptInGlue]>;
def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp, [SDNPOptInGlue]>;
def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp, [SDNPOptInGlue]>;

// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
Expand Down Expand Up @@ -523,15 +507,4 @@ def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;

def AMDGPUreadlane : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_readlane node:$src0, node:$src1),
(AMDGPUreadlane_impl node:$src0, node:$src1)]>;

def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
[(int_amdgcn_readfirstlane node:$src),
(AMDGPUreadfirstlane_impl node:$src)]>;

def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
(AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;

89 changes: 75 additions & 14 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6088,23 +6088,44 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
unsigned IntrinsicID = N->getConstantOperandVal(0);
SDValue Src0 = N->getOperand(1);
unsigned IID = N->getConstantOperandVal(0);
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);

auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
MVT VT) -> SDValue {
return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
: Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
: DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
// Builds a single lane-op node as an ISD::INTRINSIC_WO_CHAIN with result type
// ValT. The operand list is: the intrinsic ID (IID) as a target constant, then
// the source operands that the given intrinsic takes, then (optionally) a
// convergence-control glue operand copied from the original node N.
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
SDValue Src2, MVT ValT) -> SDValue {
SmallVector<SDValue, 8> Operands;
// Operand 0 is always the intrinsic ID.
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
// Append only the sources this intrinsic uses; Src1/Src2 are ignored for
// intrinsics that do not take them. There is no default case, so any other
// IID leaves just the ID operand in the list.
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
Operands.push_back(Src0);
break;
case Intrinsic::amdgcn_readlane:
Operands.push_back(Src0);
Operands.push_back(Src1);
break;
case Intrinsic::amdgcn_writelane:
Operands.push_back(Src0);
Operands.push_back(Src1);
Operands.push_back(Src2);
break;
}

// If N has a glued node (asserted to be CONVERGENCECTRL_GLUE), create a
// fresh CONVERGENCECTRL_GLUE node from that glue's operand 0 and append it
// as the last operand of the new node.
if (SDNode *GL = N->getGluedNode()) {
assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
GL = GL->getOperand(0).getNode();
Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
SDValue(GL, 0)));
}

return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
};

SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IntrinsicID == Intrinsic::amdgcn_readlane ||
IntrinsicID == Intrinsic::amdgcn_writelane) {
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
Src1 = N->getOperand(2);
if (IntrinsicID == Intrinsic::amdgcn_writelane)
if (IID == Intrinsic::amdgcn_writelane)
Src2 = N->getOperand(3);
}

Expand All @@ -6129,13 +6150,55 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
if (ValSize % 32 != 0)
return SDValue();

// Scalarizes a vector-typed lane-op node N: for each vector element it builds
// a node with N's opcode and the element type as result, then recombines the
// per-element results with a build-vector. Unlike a plain operand-by-operand
// unroll, a glued node on N is handled specially: each per-element node gets
// its own newly created CONVERGENCECTRL_GLUE operand.
auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
EVT VT = N->getValueType(0);
unsigned NE = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
SmallVector<SDValue, 8> Scalars;
unsigned NumOperands = N->getNumOperands();
SmallVector<SDValue, 4> Operands(NumOperands);
SDNode *GL = N->getGluedNode();

if (GL) {
// Only convergencectrl_glue is handled.
assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
}

for (unsigned i = 0; i != NE; ++i) {
// When N has a glued node, the last operand slot is treated as the glue
// operand and is skipped here; it is filled in separately below.
for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
++j) {
SDValue Operand = N->getOperand(j);
EVT OperandVT = Operand.getValueType();
if (OperandVT.isVector()) {
// A vector operand; extract a single element.
EVT OperandEltVT = OperandVT.getVectorElementType();
Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
Operand, DAG.getVectorIdxConstant(i, SL));
} else {
// A scalar operand; just use it as is.
Operands[j] = Operand;
}
}

// Build a new CONVERGENCECTRL_GLUE from the original glue's operand 0 for
// this element's node, rather than reusing N's glue operand directly.
if (GL)
Operands[NumOperands - 1] =
DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
SDValue(GL->getOperand(0).getNode(), 0));

Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
}

// Reassemble the per-element results into a vector of NE x EltVT.
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
return DAG.getBuildVector(VecVT, SL, Scalars);
};

if (VT.isVector()) {
switch (MVT::SimpleValueType EltTy =
VT.getVectorElementType().getSimpleVT().SimpleTy) {
case MVT::i32:
case MVT::f32: {
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
return DAG.UnrollVectorOp(LaneOp.getNode());
return unrollLaneOp(LaneOp.getNode());
}
case MVT::i16:
case MVT::f16:
Expand Down Expand Up @@ -6170,7 +6233,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src2 = DAG.getBitcast(VecVT, Src2);

SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
return DAG.getBitcast(VT, UnrolledLaneOp);
}

Expand Down Expand Up @@ -15931,8 +15994,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
// Target-specific read-modify-write atomics are sources of divergence.
return true;
case AMDGPUISD::WRITELANE:
return true;
default:
if (auto *A = dyn_cast<AtomicSDNode>(N)) {
// Generic read-modify-write atomics are sources of divergence.
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3383,7 +3383,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
(i32 (AMDGPUreadfirstlane (i32 imm:$src))),
(i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
}

foreach vt = Reg32Types.types in {
def : GCNPat<(vt (AMDGPUreadfirstlane (vt VRegOrLdsSrc_32:$src0))),
def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))),
(V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0))
>;
}
Expand Down
12 changes: 7 additions & 5 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -779,19 +779,21 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
} // End isCommutable = 1

// These are special and do not read the exec mask.
let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,[]>;
let isConvergent = 1, Uses = []<Register>, IsInvalidSingleUseConsumer = 1 in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>;
let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>;
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> {
let IsInvalidSingleUseProducer = 1;
}
} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1

foreach vt = Reg32Types.types in {
def : GCNPat<(vt (AMDGPUreadlane vt:$src0, i32:$src1)),
def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
(V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1)
>;

def : GCNPat<(vt (AMDGPUwritelane vt:$src0, i32:$src1, vt:$src2)),
def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
(V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2)
>;
}
Expand Down
65 changes: 65 additions & 0 deletions llvm/test/CodeGen/AMDGPU/convergence-laneops.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s

; CHECK-LABEL: name: basic_readfirstlane_i64
; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
; An i64 readfirstlane carrying a "convergencectrl" operand bundle, executed
; only on the %then path. The check lines above expect two V_READFIRSTLANE_B32
; instructions, each with an implicit use of the anchor's token.
define i64 @basic_readfirstlane_i64(i64 %src, i1 %cond) #0 {
entry:
%t = call token @llvm.experimental.convergence.anchor()
%x = add i64 %src, 1
br i1 %cond, label %then, label %else

then:
%r = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %x) [ "convergencectrl"(token %t) ]
br label %else

else:
%p = phi i64 [%r, %then], [%x, %entry]
ret i64 %p
}

; CHECK-LABEL: name: basic_readlane_i64
; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; ISEL: {{.*}} = V_READLANE_B32 {{.*}}, implicit [[TOKEN]]
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; ISEL: {{.*}} = V_READLANE_B32 {{.*}}, implicit [[TOKEN]]
; An i64 readlane (lane index %lane) carrying a "convergencectrl" operand
; bundle, executed only on the %then path. The check lines above expect two
; V_READLANE_B32 instructions, each with an implicit use of the anchor's token.
define i64 @basic_readlane_i64(i64 %src, i32 %lane, i1 %cond) #0 {
entry:
%t = call token @llvm.experimental.convergence.anchor()
%x = add i64 %src, 1
br i1 %cond, label %then, label %else

then:
%r = call i64 @llvm.amdgcn.readlane.i64(i64 %x, i32 %lane) [ "convergencectrl"(token %t) ]
br label %else

else:
%p = phi i64 [%r, %then], [%x, %entry]
ret i64 %p
}

; CHECK-LABEL: name: basic_writelane_i64
; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; ISEL: {{.*}} = V_WRITELANE_B32 {{.*}}, implicit [[TOKEN]]
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; ISEL: {{.*}} = V_WRITELANE_B32 {{.*}}, implicit [[TOKEN]]
; An i64 writelane carrying a "convergencectrl" operand bundle, executed only
; on the %then path. %old, loaded from %out, is passed as the third operand.
; The check lines above expect two V_WRITELANE_B32 instructions, each with an
; implicit use of the anchor's token.
define i64 @basic_writelane_i64(i64 %src, i1 %cond, i32 %lane, ptr addrspace(1) %out) #0 {
entry:
%old = load i64, ptr addrspace(1) %out
%t = call token @llvm.experimental.convergence.anchor()
%x = add i64 %src, 1
br i1 %cond, label %then, label %else

then:
%r = call i64 @llvm.amdgcn.writelane.i64(i64 %x, i32 %lane, i64 %old) [ "convergencectrl"(token %t) ]
br label %else

else:
%p = phi i64 [%r, %then], [%x, %entry]
ret i64 %p
}

This file was deleted.

Loading