Skip to content

Commit 741566b

Browse files
committed
PR feedback
Update test prefixes, refactor lower function for just Subgroup Shuffle, clang format
1 parent 83f5dd6 commit 741566b

File tree

2 files changed

+113
-123
lines changed

2 files changed

+113
-123
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 50 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -7280,13 +7280,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
72807280
return DAG.getBitcast(VT, UnrolledLaneOp);
72817281
}
72827282

7283-
// Right now, only subgroup.shuffle implemented, but other
7284-
// future subgroup ops can use this function too
7285-
static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
7286-
SelectionDAG &DAG) {
7283+
static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
7284+
SelectionDAG &DAG) {
72877285
EVT VT = N->getValueType(0);
72887286
unsigned ValSize = VT.getSizeInBits();
7289-
unsigned IID = N->getConstantOperandVal(0);
72907287
SDLoc SL(N);
72917288

72927289
SDValue Value = N->getOperand(1);
@@ -7310,60 +7307,53 @@ static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
73107307
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
73117308
};
73127309

7313-
switch (IID) {
7314-
case Intrinsic::amdgcn_subgroup_shuffle:
7315-
if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7316-
// If we can bpermute across the whole wave, then just do that
7317-
SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7318-
MVT::i32, {ShiftedIndex, Value32});
7319-
return DAG.getBitcast(VT, BPermute);
7320-
} else {
7321-
assert(TLI.getSubtarget()->isWave64());
7322-
7323-
// Otherwise, we need to make use of whole wave mode
7324-
SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
7325-
SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7326-
7327-
// Set inactive lanes to poison
7328-
SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7329-
MVT::i32, {Value32, PoisonVal});
7330-
SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7331-
MVT::i32, {ShiftedIndex, PoisonIndex});
7332-
7333-
SDValue Swapped = MakeIntrinsic(Intrinsic::amdgcn_permlane64,
7334-
MVT::i32, {WWMValue});
7335-
7336-
// Get permutation of each half, then we'll select which one to use
7337-
SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7338-
MVT::i32, {WWMIndex, WWMValue});
7339-
SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7340-
MVT::i32, {WWMIndex, Swapped});
7341-
SDValue BPermOtherHalfWWM = MakeIntrinsic(Intrinsic::amdgcn_wwm,
7342-
MVT::i32, {BPermOtherHalf});
7343-
7344-
// Select which side to take the permute from
7345-
SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
7346-
SDValue ThreadIDLo = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7347-
{ThreadIDMask,
7348-
DAG.getTargetConstant(0, SL,
7349-
MVT::i32)});
7350-
SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
7351-
{ThreadIDMask, ThreadIDLo});
7352-
7353-
SDValue SameOrOtherHalf = DAG.getNode(ISD::AND, SL, MVT::i32,
7354-
DAG.getNode(ISD::XOR, SL, MVT::i32,
7355-
ThreadID, Index),
7356-
DAG.getTargetConstant(32, SL,
7357-
MVT::i32));
7358-
SDValue UseSameHalf = DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7359-
DAG.getConstant(0, SL, MVT::i32),
7360-
ISD::SETEQ);
7361-
SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf,
7362-
BPermSameHalf, BPermOtherHalfWWM);
7363-
return DAG.getBitcast(VT, Result);
7364-
}
7365-
default:
7366-
return SDValue();
7310+
if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7311+
// If we can bpermute across the whole wave, then just do that
7312+
SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7313+
{ShiftedIndex, Value32});
7314+
return DAG.getBitcast(VT, BPermute);
7315+
} else {
7316+
assert(TLI.getSubtarget()->isWave64());
7317+
7318+
// Otherwise, we need to make use of whole wave mode
7319+
SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
7320+
SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7321+
7322+
// Set inactive lanes to poison
7323+
SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7324+
{Value32, PoisonVal});
7325+
SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7326+
{ShiftedIndex, PoisonIndex});
7327+
7328+
SDValue Swapped =
7329+
MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7330+
7331+
// Get permutation of each half, then we'll select which one to use
7332+
SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7333+
MVT::i32, {WWMIndex, WWMValue});
7334+
SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7335+
MVT::i32, {WWMIndex, Swapped});
7336+
SDValue BPermOtherHalfWWM =
7337+
MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7338+
7339+
// Select which side to take the permute from
7340+
SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
7341+
SDValue ThreadIDLo =
7342+
MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7343+
{ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
7344+
SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
7345+
{ThreadIDMask, ThreadIDLo});
7346+
7347+
SDValue SameOrOtherHalf =
7348+
DAG.getNode(ISD::AND, SL, MVT::i32,
7349+
DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
7350+
DAG.getTargetConstant(32, SL, MVT::i32));
7351+
SDValue UseSameHalf =
7352+
DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7353+
DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
7354+
SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
7355+
BPermOtherHalfWWM);
7356+
return DAG.getBitcast(VT, Result);
73677357
}
73687358
}
73697359

@@ -10275,7 +10265,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1027510265
return DAG.getMergeValues(Poisons, SDLoc(Op));
1027610266
}
1027710267
case Intrinsic::amdgcn_subgroup_shuffle:
10278-
return lowerSubgroupOp(*this, Op.getNode(), DAG);
10268+
return lowerSubgroupShuffle(*this, Op.getNode(), DAG);
1027910269
default:
1028010270
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1028110271
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll

Lines changed: 63 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,75 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
3-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
3+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
44

5-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-64 %s
6-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-64 %s
5+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
6+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
77

88
declare float @llvm.amdgcn.subgroup.shuffle.float(float, i32)
99

1010
define float @test_subgroup_shuffle_scalar(float %val, i32 %idx) {
11-
; GFX11-LABEL: test_subgroup_shuffle_scalar:
12-
; GFX11: ; %bb.0: ; %entry
13-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14-
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
15-
; GFX11-NEXT: ds_bpermute_b32 v0, v1, v0
16-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
17-
; GFX11-NEXT: s_setpc_b64 s[30:31]
11+
; GFX11-W32-LABEL: test_subgroup_shuffle_scalar:
12+
; GFX11-W32: ; %bb.0: ; %entry
13+
; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14+
; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
15+
; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0
16+
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
17+
; GFX11-W32-NEXT: s_setpc_b64 s[30:31]
1818
;
19-
; GFX12-LABEL: test_subgroup_shuffle_scalar:
20-
; GFX12: ; %bb.0: ; %entry
21-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
22-
; GFX12-NEXT: s_wait_expcnt 0x0
23-
; GFX12-NEXT: s_wait_samplecnt 0x0
24-
; GFX12-NEXT: s_wait_bvhcnt 0x0
25-
; GFX12-NEXT: s_wait_kmcnt 0x0
26-
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
27-
; GFX12-NEXT: ds_bpermute_b32 v0, v1, v0
28-
; GFX12-NEXT: s_wait_dscnt 0x0
29-
; GFX12-NEXT: s_setpc_b64 s[30:31]
19+
; GFX12-W32-LABEL: test_subgroup_shuffle_scalar:
20+
; GFX12-W32: ; %bb.0: ; %entry
21+
; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0
22+
; GFX12-W32-NEXT: s_wait_expcnt 0x0
23+
; GFX12-W32-NEXT: s_wait_samplecnt 0x0
24+
; GFX12-W32-NEXT: s_wait_bvhcnt 0x0
25+
; GFX12-W32-NEXT: s_wait_kmcnt 0x0
26+
; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
27+
; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0
28+
; GFX12-W32-NEXT: s_wait_dscnt 0x0
29+
; GFX12-W32-NEXT: s_setpc_b64 s[30:31]
3030
;
31-
; GFX11-64-LABEL: test_subgroup_shuffle_scalar:
32-
; GFX11-64: ; %bb.0: ; %entry
33-
; GFX11-64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34-
; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
35-
; GFX11-64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
36-
; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
37-
; GFX11-64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
38-
; GFX11-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
39-
; GFX11-64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
40-
; GFX11-64-NEXT: s_or_saveexec_b64 s[0:1], -1
41-
; GFX11-64-NEXT: v_permlane64_b32 v2, v0
42-
; GFX11-64-NEXT: ds_bpermute_b32 v2, v3, v2
43-
; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
44-
; GFX11-64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
45-
; GFX11-64-NEXT: ds_bpermute_b32 v0, v3, v0
46-
; GFX11-64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
47-
; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
48-
; GFX11-64-NEXT: v_xor_b32_e32 v1, v3, v1
49-
; GFX11-64-NEXT: s_waitcnt lgkmcnt(1)
50-
; GFX11-64-NEXT: v_mov_b32_e32 v3, v2
51-
; GFX11-64-NEXT: v_and_b32_e32 v1, 32, v1
52-
; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
53-
; GFX11-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
54-
; GFX11-64-NEXT: s_waitcnt lgkmcnt(0)
55-
; GFX11-64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
56-
; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
57-
; GFX11-64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
58-
; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
59-
; GFX11-64-NEXT: s_waitcnt vmcnt(0)
60-
; GFX11-64-NEXT: s_setpc_b64 s[30:31]
31+
; GFX11-W64-LABEL: test_subgroup_shuffle_scalar:
32+
; GFX11-W64: ; %bb.0: ; %entry
33+
; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34+
; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
35+
; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
36+
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
37+
; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
38+
; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
39+
; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
40+
; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
41+
; GFX11-W64-NEXT: v_permlane64_b32 v2, v0
42+
; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2
43+
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
44+
; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
45+
; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0
46+
; GFX11-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
47+
; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
48+
; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1
49+
; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1)
50+
; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2
51+
; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1
52+
; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
53+
; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
54+
; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0)
55+
; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
56+
; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
57+
; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
58+
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
59+
; GFX11-W64-NEXT: s_waitcnt vmcnt(0)
60+
; GFX11-W64-NEXT: s_setpc_b64 s[30:31]
6161
;
62-
; GFX12-64-LABEL: test_subgroup_shuffle_scalar:
63-
; GFX12-64: ; %bb.0: ; %entry
64-
; GFX12-64-NEXT: s_wait_loadcnt_dscnt 0x0
65-
; GFX12-64-NEXT: s_wait_expcnt 0x0
66-
; GFX12-64-NEXT: s_wait_samplecnt 0x0
67-
; GFX12-64-NEXT: s_wait_bvhcnt 0x0
68-
; GFX12-64-NEXT: s_wait_kmcnt 0x0
69-
; GFX12-64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
70-
; GFX12-64-NEXT: ds_bpermute_b32 v0, v1, v0
71-
; GFX12-64-NEXT: s_wait_dscnt 0x0
72-
; GFX12-64-NEXT: s_setpc_b64 s[30:31]
62+
; GFX12-W64-LABEL: test_subgroup_shuffle_scalar:
63+
; GFX12-W64: ; %bb.0: ; %entry
64+
; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0
65+
; GFX12-W64-NEXT: s_wait_expcnt 0x0
66+
; GFX12-W64-NEXT: s_wait_samplecnt 0x0
67+
; GFX12-W64-NEXT: s_wait_bvhcnt 0x0
68+
; GFX12-W64-NEXT: s_wait_kmcnt 0x0
69+
; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
70+
; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0
71+
; GFX12-W64-NEXT: s_wait_dscnt 0x0
72+
; GFX12-W64-NEXT: s_setpc_b64 s[30:31]
7373
entry:
7474
%0 = tail call float @llvm.amdgcn.subgroup.shuffle(float %val, i32 %idx)
7575
ret float %0

0 commit comments

Comments
 (0)