Skip to content

Commit eadfd8b

Browse files
committed
PR feedback
Update test prefixes, refactor the lowering function to handle only subgroup shuffle, and apply clang-format
1 parent 4467218 commit eadfd8b

File tree

2 files changed

+113
-123
lines changed

2 files changed

+113
-123
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 50 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -7269,13 +7269,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
72697269
return DAG.getBitcast(VT, UnrolledLaneOp);
72707270
}
72717271

7272-
// Right now, only subgroup.shuffle implemented, but other
7273-
// future subgroup ops can use this function too
7274-
static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
7275-
SelectionDAG &DAG) {
7272+
static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
7273+
SelectionDAG &DAG) {
72767274
EVT VT = N->getValueType(0);
72777275
unsigned ValSize = VT.getSizeInBits();
7278-
unsigned IID = N->getConstantOperandVal(0);
72797276
SDLoc SL(N);
72807277

72817278
SDValue Value = N->getOperand(1);
@@ -7299,60 +7296,53 @@ static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
72997296
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
73007297
};
73017298

7302-
switch (IID) {
7303-
case Intrinsic::amdgcn_subgroup_shuffle:
7304-
if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7305-
// If we can bpermute across the whole wave, then just do that
7306-
SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7307-
MVT::i32, {ShiftedIndex, Value32});
7308-
return DAG.getBitcast(VT, BPermute);
7309-
} else {
7310-
assert(TLI.getSubtarget()->isWave64());
7311-
7312-
// Otherwise, we need to make use of whole wave mode
7313-
SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
7314-
SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7315-
7316-
// Set inactive lanes to poison
7317-
SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7318-
MVT::i32, {Value32, PoisonVal});
7319-
SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7320-
MVT::i32, {ShiftedIndex, PoisonIndex});
7321-
7322-
SDValue Swapped = MakeIntrinsic(Intrinsic::amdgcn_permlane64,
7323-
MVT::i32, {WWMValue});
7324-
7325-
// Get permutation of each half, then we'll select which one to use
7326-
SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7327-
MVT::i32, {WWMIndex, WWMValue});
7328-
SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7329-
MVT::i32, {WWMIndex, Swapped});
7330-
SDValue BPermOtherHalfWWM = MakeIntrinsic(Intrinsic::amdgcn_wwm,
7331-
MVT::i32, {BPermOtherHalf});
7332-
7333-
// Select which side to take the permute from
7334-
SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
7335-
SDValue ThreadIDLo = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7336-
{ThreadIDMask,
7337-
DAG.getTargetConstant(0, SL,
7338-
MVT::i32)});
7339-
SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
7340-
{ThreadIDMask, ThreadIDLo});
7341-
7342-
SDValue SameOrOtherHalf = DAG.getNode(ISD::AND, SL, MVT::i32,
7343-
DAG.getNode(ISD::XOR, SL, MVT::i32,
7344-
ThreadID, Index),
7345-
DAG.getTargetConstant(32, SL,
7346-
MVT::i32));
7347-
SDValue UseSameHalf = DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7348-
DAG.getConstant(0, SL, MVT::i32),
7349-
ISD::SETEQ);
7350-
SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf,
7351-
BPermSameHalf, BPermOtherHalfWWM);
7352-
return DAG.getBitcast(VT, Result);
7353-
}
7354-
default:
7355-
return SDValue();
7299+
if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7300+
// If we can bpermute across the whole wave, then just do that
7301+
SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7302+
{ShiftedIndex, Value32});
7303+
return DAG.getBitcast(VT, BPermute);
7304+
} else {
7305+
assert(TLI.getSubtarget()->isWave64());
7306+
7307+
// Otherwise, we need to make use of whole wave mode
7308+
SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
7309+
SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7310+
7311+
// Set inactive lanes to poison
7312+
SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7313+
{Value32, PoisonVal});
7314+
SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7315+
{ShiftedIndex, PoisonIndex});
7316+
7317+
SDValue Swapped =
7318+
MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7319+
7320+
// Get permutation of each half, then we'll select which one to use
7321+
SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7322+
MVT::i32, {WWMIndex, WWMValue});
7323+
SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7324+
MVT::i32, {WWMIndex, Swapped});
7325+
SDValue BPermOtherHalfWWM =
7326+
MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7327+
7328+
// Select which side to take the permute from
7329+
SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
7330+
SDValue ThreadIDLo =
7331+
MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7332+
{ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
7333+
SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
7334+
{ThreadIDMask, ThreadIDLo});
7335+
7336+
SDValue SameOrOtherHalf =
7337+
DAG.getNode(ISD::AND, SL, MVT::i32,
7338+
DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
7339+
DAG.getTargetConstant(32, SL, MVT::i32));
7340+
SDValue UseSameHalf =
7341+
DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7342+
DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
7343+
SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
7344+
BPermOtherHalfWWM);
7345+
return DAG.getBitcast(VT, Result);
73567346
}
73577347
}
73587348

@@ -10264,7 +10254,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1026410254
return DAG.getMergeValues(Poisons, SDLoc(Op));
1026510255
}
1026610256
case Intrinsic::amdgcn_subgroup_shuffle:
10267-
return lowerSubgroupOp(*this, Op.getNode(), DAG);
10257+
return lowerSubgroupShuffle(*this, Op.getNode(), DAG);
1026810258
default:
1026910259
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1027010260
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.subgroup.shuffle.ll

Lines changed: 63 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,75 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
3-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-W32 %s
3+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-W32 %s
44

5-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-64 %s
6-
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-64 %s
5+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11-W64 %s
6+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-W64 %s
77

88
declare float @llvm.amdgcn.subgroup.shuffle.float(float, i32)
99

1010
define float @test_subgroup_shuffle_scalar(float %val, i32 %idx) {
11-
; GFX11-LABEL: test_subgroup_shuffle_scalar:
12-
; GFX11: ; %bb.0: ; %entry
13-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14-
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
15-
; GFX11-NEXT: ds_bpermute_b32 v0, v1, v0
16-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
17-
; GFX11-NEXT: s_setpc_b64 s[30:31]
11+
; GFX11-W32-LABEL: test_subgroup_shuffle_scalar:
12+
; GFX11-W32: ; %bb.0: ; %entry
13+
; GFX11-W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14+
; GFX11-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
15+
; GFX11-W32-NEXT: ds_bpermute_b32 v0, v1, v0
16+
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
17+
; GFX11-W32-NEXT: s_setpc_b64 s[30:31]
1818
;
19-
; GFX12-LABEL: test_subgroup_shuffle_scalar:
20-
; GFX12: ; %bb.0: ; %entry
21-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
22-
; GFX12-NEXT: s_wait_expcnt 0x0
23-
; GFX12-NEXT: s_wait_samplecnt 0x0
24-
; GFX12-NEXT: s_wait_bvhcnt 0x0
25-
; GFX12-NEXT: s_wait_kmcnt 0x0
26-
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
27-
; GFX12-NEXT: ds_bpermute_b32 v0, v1, v0
28-
; GFX12-NEXT: s_wait_dscnt 0x0
29-
; GFX12-NEXT: s_setpc_b64 s[30:31]
19+
; GFX12-W32-LABEL: test_subgroup_shuffle_scalar:
20+
; GFX12-W32: ; %bb.0: ; %entry
21+
; GFX12-W32-NEXT: s_wait_loadcnt_dscnt 0x0
22+
; GFX12-W32-NEXT: s_wait_expcnt 0x0
23+
; GFX12-W32-NEXT: s_wait_samplecnt 0x0
24+
; GFX12-W32-NEXT: s_wait_bvhcnt 0x0
25+
; GFX12-W32-NEXT: s_wait_kmcnt 0x0
26+
; GFX12-W32-NEXT: v_lshlrev_b32_e32 v1, 2, v1
27+
; GFX12-W32-NEXT: ds_bpermute_b32 v0, v1, v0
28+
; GFX12-W32-NEXT: s_wait_dscnt 0x0
29+
; GFX12-W32-NEXT: s_setpc_b64 s[30:31]
3030
;
31-
; GFX11-64-LABEL: test_subgroup_shuffle_scalar:
32-
; GFX11-64: ; %bb.0: ; %entry
33-
; GFX11-64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34-
; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
35-
; GFX11-64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
36-
; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
37-
; GFX11-64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
38-
; GFX11-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
39-
; GFX11-64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
40-
; GFX11-64-NEXT: s_or_saveexec_b64 s[0:1], -1
41-
; GFX11-64-NEXT: v_permlane64_b32 v2, v0
42-
; GFX11-64-NEXT: ds_bpermute_b32 v2, v3, v2
43-
; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
44-
; GFX11-64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
45-
; GFX11-64-NEXT: ds_bpermute_b32 v0, v3, v0
46-
; GFX11-64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
47-
; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
48-
; GFX11-64-NEXT: v_xor_b32_e32 v1, v3, v1
49-
; GFX11-64-NEXT: s_waitcnt lgkmcnt(1)
50-
; GFX11-64-NEXT: v_mov_b32_e32 v3, v2
51-
; GFX11-64-NEXT: v_and_b32_e32 v1, 32, v1
52-
; GFX11-64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
53-
; GFX11-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
54-
; GFX11-64-NEXT: s_waitcnt lgkmcnt(0)
55-
; GFX11-64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
56-
; GFX11-64-NEXT: s_xor_saveexec_b64 s[0:1], -1
57-
; GFX11-64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
58-
; GFX11-64-NEXT: s_mov_b64 exec, s[0:1]
59-
; GFX11-64-NEXT: s_waitcnt vmcnt(0)
60-
; GFX11-64-NEXT: s_setpc_b64 s[30:31]
31+
; GFX11-W64-LABEL: test_subgroup_shuffle_scalar:
32+
; GFX11-W64: ; %bb.0: ; %entry
33+
; GFX11-W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34+
; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
35+
; GFX11-W64-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
36+
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
37+
; GFX11-W64-NEXT: v_lshlrev_b32_e32 v3, 2, v1
38+
; GFX11-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
39+
; GFX11-W64-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
40+
; GFX11-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
41+
; GFX11-W64-NEXT: v_permlane64_b32 v2, v0
42+
; GFX11-W64-NEXT: ds_bpermute_b32 v2, v3, v2
43+
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
44+
; GFX11-W64-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0
45+
; GFX11-W64-NEXT: ds_bpermute_b32 v0, v3, v0
46+
; GFX11-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v4
47+
; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
48+
; GFX11-W64-NEXT: v_xor_b32_e32 v1, v3, v1
49+
; GFX11-W64-NEXT: s_waitcnt lgkmcnt(1)
50+
; GFX11-W64-NEXT: v_mov_b32_e32 v3, v2
51+
; GFX11-W64-NEXT: v_and_b32_e32 v1, 32, v1
52+
; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
53+
; GFX11-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
54+
; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0)
55+
; GFX11-W64-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
56+
; GFX11-W64-NEXT: s_xor_saveexec_b64 s[0:1], -1
57+
; GFX11-W64-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
58+
; GFX11-W64-NEXT: s_mov_b64 exec, s[0:1]
59+
; GFX11-W64-NEXT: s_waitcnt vmcnt(0)
60+
; GFX11-W64-NEXT: s_setpc_b64 s[30:31]
6161
;
62-
; GFX12-64-LABEL: test_subgroup_shuffle_scalar:
63-
; GFX12-64: ; %bb.0: ; %entry
64-
; GFX12-64-NEXT: s_wait_loadcnt_dscnt 0x0
65-
; GFX12-64-NEXT: s_wait_expcnt 0x0
66-
; GFX12-64-NEXT: s_wait_samplecnt 0x0
67-
; GFX12-64-NEXT: s_wait_bvhcnt 0x0
68-
; GFX12-64-NEXT: s_wait_kmcnt 0x0
69-
; GFX12-64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
70-
; GFX12-64-NEXT: ds_bpermute_b32 v0, v1, v0
71-
; GFX12-64-NEXT: s_wait_dscnt 0x0
72-
; GFX12-64-NEXT: s_setpc_b64 s[30:31]
62+
; GFX12-W64-LABEL: test_subgroup_shuffle_scalar:
63+
; GFX12-W64: ; %bb.0: ; %entry
64+
; GFX12-W64-NEXT: s_wait_loadcnt_dscnt 0x0
65+
; GFX12-W64-NEXT: s_wait_expcnt 0x0
66+
; GFX12-W64-NEXT: s_wait_samplecnt 0x0
67+
; GFX12-W64-NEXT: s_wait_bvhcnt 0x0
68+
; GFX12-W64-NEXT: s_wait_kmcnt 0x0
69+
; GFX12-W64-NEXT: v_lshlrev_b32_e32 v1, 2, v1
70+
; GFX12-W64-NEXT: ds_bpermute_b32 v0, v1, v0
71+
; GFX12-W64-NEXT: s_wait_dscnt 0x0
72+
; GFX12-W64-NEXT: s_setpc_b64 s[30:31]
7373
entry:
7474
%0 = tail call float @llvm.amdgcn.subgroup.shuffle(float %val, i32 %idx)
7575
ret float %0

0 commit comments

Comments
 (0)