Skip to content

Commit 83f5dd6

Browse files
committed
[AMDGPU] Add new llvm.amdgcn.subgroup.shuffle intrinsic
This intrinsic will be useful for implementing the OpGroupNonUniformShuffle operation in the SPIR-V reference.

Signed-off-by: Domenic Nutile <[email protected]>
1 parent e6fc654 commit 83f5dd6

File tree

4 files changed

+178
-0
lines changed

4 files changed

+178
-0
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2718,6 +2718,15 @@ def int_amdgcn_call_whole_wave:
27182718
llvm_vararg_ty], // The arguments to the callee.
27192719
[IntrConvergent]>;
27202720

2721+
// <result> llvm.amdgcn.subgroup.shuffle <value> <id>
//
// Reads <value> from the lane selected by <id>. The value/result type is
// overloaded (any scalar of floating-point, integer, or Boolean type);
// LLVMMatchType<0> forces the operand and result types to agree.
def int_amdgcn_subgroup_shuffle :
  Intrinsic<[llvm_any_ty], // return types
            [LLVMMatchType<0>, llvm_i32_ty], // arg types
            [IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags
2729+
27212730
//===----------------------------------------------------------------------===//
27222731
// CI+ Intrinsics
27232732
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1894,6 +1894,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18941894
bool requiresWaitsBeforeSystemScopeStores() const {
18951895
return RequiresWaitsBeforeSystemScopeStores;
18961896
}
1897+
1898+
bool supportsWaveWideBPermute() const {
1899+
return ((getGeneration() == AMDGPUSubtarget::GFX12) || isWave32());
1900+
}
18971901
};
18981902

18991903
class GCNUserSGPRUsageInfo {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7280,6 +7280,93 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
72807280
return DAG.getBitcast(VT, UnrolledLaneOp);
72817281
}
72827282

7283+
// Right now, only subgroup.shuffle implemented, but other
7284+
// future subgroup ops can use this function too
7285+
static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
7286+
SelectionDAG &DAG) {
7287+
EVT VT = N->getValueType(0);
7288+
unsigned ValSize = VT.getSizeInBits();
7289+
unsigned IID = N->getConstantOperandVal(0);
7290+
SDLoc SL(N);
7291+
7292+
SDValue Value = N->getOperand(1);
7293+
SDValue Index = N->getOperand(2);
7294+
7295+
// ds_bpermute requires index to be multiplied by 4
7296+
SDValue ShiftAmount = DAG.getTargetConstant(2, SL, MVT::i32);
7297+
SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
7298+
ShiftAmount);
7299+
7300+
// Intrinsics will require i32 to operate on
7301+
SDValue Value32 = Value;
7302+
if ((ValSize != 32) || (VT.isFloatingPoint()))
7303+
Value32 = DAG.getBitcast(MVT::i32, Value);
7304+
7305+
auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
7306+
SmallVector<SDValue> IntrinArgs) -> SDValue {
7307+
SmallVector<SDValue> Operands(1);
7308+
Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
7309+
Operands.append(IntrinArgs);
7310+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
7311+
};
7312+
7313+
switch (IID) {
7314+
case Intrinsic::amdgcn_subgroup_shuffle:
7315+
if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7316+
// If we can bpermute across the whole wave, then just do that
7317+
SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7318+
MVT::i32, {ShiftedIndex, Value32});
7319+
return DAG.getBitcast(VT, BPermute);
7320+
} else {
7321+
assert(TLI.getSubtarget()->isWave64());
7322+
7323+
// Otherwise, we need to make use of whole wave mode
7324+
SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
7325+
SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7326+
7327+
// Set inactive lanes to poison
7328+
SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7329+
MVT::i32, {Value32, PoisonVal});
7330+
SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7331+
MVT::i32, {ShiftedIndex, PoisonIndex});
7332+
7333+
SDValue Swapped = MakeIntrinsic(Intrinsic::amdgcn_permlane64,
7334+
MVT::i32, {WWMValue});
7335+
7336+
// Get permutation of each half, then we'll select which one to use
7337+
SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7338+
MVT::i32, {WWMIndex, WWMValue});
7339+
SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7340+
MVT::i32, {WWMIndex, Swapped});
7341+
SDValue BPermOtherHalfWWM = MakeIntrinsic(Intrinsic::amdgcn_wwm,
7342+
MVT::i32, {BPermOtherHalf});
7343+
7344+
// Select which side to take the permute from
7345+
SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
7346+
SDValue ThreadIDLo = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7347+
{ThreadIDMask,
7348+
DAG.getTargetConstant(0, SL,
7349+
MVT::i32)});
7350+
SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
7351+
{ThreadIDMask, ThreadIDLo});
7352+
7353+
SDValue SameOrOtherHalf = DAG.getNode(ISD::AND, SL, MVT::i32,
7354+
DAG.getNode(ISD::XOR, SL, MVT::i32,
7355+
ThreadID, Index),
7356+
DAG.getTargetConstant(32, SL,
7357+
MVT::i32));
7358+
SDValue UseSameHalf = DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7359+
DAG.getConstant(0, SL, MVT::i32),
7360+
ISD::SETEQ);
7361+
SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf,
7362+
BPermSameHalf, BPermOtherHalfWWM);
7363+
return DAG.getBitcast(VT, Result);
7364+
}
7365+
default:
7366+
return SDValue();
7367+
}
7368+
}
7369+
72837370
void SITargetLowering::ReplaceNodeResults(SDNode *N,
72847371
SmallVectorImpl<SDValue> &Results,
72857372
SelectionDAG &DAG) const {
@@ -10187,6 +10274,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1018710274
Poisons.push_back(DAG.getPOISON(ValTy));
1018810275
return DAG.getMergeValues(Poisons, SDLoc(Op));
1018910276
}
10277+
case Intrinsic::amdgcn_subgroup_shuffle:
10278+
return lowerSubgroupOp(*this, Op.getNode(), DAG);
1019010279
default:
1019110280
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1019210281
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s

; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-64 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-64 %s

; The intrinsic is overloaded, so the float overload mangles as ".f32";
; the declaration must use the same name the call site uses.
declare float @llvm.amdgcn.subgroup.shuffle.f32(float, i32)

define float @test_subgroup_shuffle_scalar(float %val, i32 %idx) {
; GFX11-LABEL: test_subgroup_shuffle_scalar:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    ds_bpermute_b32 v0, v1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_subgroup_shuffle_scalar:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT:    ds_bpermute_b32 v0, v1, v0
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-64-LABEL: test_subgroup_shuffle_scalar:
; GFX11-64:       ; %bb.0: ; %entry
; GFX11-64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
; GFX11-64-NEXT:    scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
; GFX11-64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX11-64-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
; GFX11-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX11-64-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $exec
; GFX11-64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX11-64-NEXT:    v_permlane64_b32 v2, v0
; GFX11-64-NEXT:    ds_bpermute_b32 v2, v3, v2
; GFX11-64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX11-64-NEXT:    v_mbcnt_lo_u32_b32 v4, -1, 0
; GFX11-64-NEXT:    ds_bpermute_b32 v0, v3, v0
; GFX11-64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v4
; GFX11-64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-64-NEXT:    v_xor_b32_e32 v1, v3, v1
; GFX11-64-NEXT:    s_waitcnt lgkmcnt(1)
; GFX11-64-NEXT:    v_mov_b32_e32 v3, v2
; GFX11-64-NEXT:    v_and_b32_e32 v1, 32, v1
; GFX11-64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX11-64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-64-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX11-64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
; GFX11-64-NEXT:    scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
; GFX11-64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX11-64-NEXT:    s_waitcnt vmcnt(0)
; GFX11-64-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-64-LABEL: test_subgroup_shuffle_scalar:
; GFX12-64:       ; %bb.0: ; %entry
; GFX12-64-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-64-NEXT:    s_wait_expcnt 0x0
; GFX12-64-NEXT:    s_wait_samplecnt 0x0
; GFX12-64-NEXT:    s_wait_bvhcnt 0x0
; GFX12-64-NEXT:    s_wait_kmcnt 0x0
; GFX12-64-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX12-64-NEXT:    ds_bpermute_b32 v0, v1, v0
; GFX12-64-NEXT:    s_wait_dscnt 0x0
; GFX12-64-NEXT:    s_setpc_b64 s[30:31]
entry:
  %0 = tail call float @llvm.amdgcn.subgroup.shuffle.f32(float %val, i32 %idx)
  ret float %0
}

0 commit comments

Comments
 (0)