[AMDGPU] Add SchedGroupBarrier::PACK for packed math #132432
Conversation
Change-Id: I3f87324db3c085b46b2c37c8eb1a5b30f29f6858
@llvm/pr-subscribers-backend-amdgpu
Author: Jeffrey Byrnes (jrbyrnes)
Changes
For some architectures, these instructions have distinct hardware behavior (e.g. MI300 non-coissue instructions). Moreover, this gives an additional level of granularity for controlling VALU instructions.
Full diff: https://github.com/llvm/llvm-project/pull/132432.diff 4 Files Affected:
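As a usage sketch (hypothetical kernel, not taken from this patch): with the new bit, a PACK/MFMA pipeline can be requested from IR by alternating group masks 0x800 (packed arithmetic) and 0x8 (MFMA), as the added test below also does.

declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32)

define amdgpu_kernel void @pack_mfma_pipeline_sketch() {
entry:
  ; packed-math (e.g. v_pk_fma_f32) and MFMA producers would be defined here
  ; group of 1 packed-arithmetic (VOP3P, non-MFMA/WMMA) instruction: mask 0x800
  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
  ; group of 1 MFMA instruction: mask 0x8
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ret void
}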
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index ab507e3714ebb..13f1ab5483a21 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1347,6 +1347,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
- 0x0100: All DS read instructions may be scheduled across sched_barrier.
- 0x0200: All DS write instructions may be scheduled across sched_barrier.
- 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier.
+ - 0x0800: All Packed Arithmetic (e.g. V_PK_MOV, V_DOT, etc) instructions may be scheduled across sched_barrier.
llvm.amdgcn.sched.group.barrier Creates schedule groups with specific properties to create custom scheduling
pipelines. The ordering between groups is enforced by the instruction scheduler.
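A hedged sketch of the corresponding sched_barrier usage (hypothetical kernel, not part of this diff): passing only the new 0x0800 bit tells the scheduler that packed-arithmetic instructions, and nothing else, may move across the barrier.

declare void @llvm.amdgcn.sched.barrier(i32)

define amdgpu_kernel void @pack_across_barrier_sketch() {
entry:
  ; mask 0x0800: only packed-arithmetic (VOP3P) instructions may be scheduled across this point
  call void @llvm.amdgcn.sched.barrier(i32 2048)
  ret void
}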
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index bbd262748d680..b8ee898b2058e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,8 +75,9 @@ enum class SchedGroupMask {
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
TRANS = 1u << 10,
+ PACK = 1u << 11,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
- DS_READ | DS_WRITE | TRANS,
+ DS_READ | DS_WRITE | TRANS | PACK,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -2414,7 +2415,8 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)
+ && !TII->isVOP3P(MI))
Result = true;
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
@@ -2455,6 +2457,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
TII->isTRANS(MI))
Result = true;
+ else if (((SGMask & SchedGroupMask::PACK) != SchedGroupMask::NONE) &&
+ TII->isVOP3P(MI) && !TII->isMFMAorWMMA(MI))
+ Result = true;
+
LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
@@ -2634,15 +2640,17 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
// allowed past the SCHED_BARRIER.
SchedGroupMask InvertedMask = ~Mask;
- // ALU implies VALU, SALU, MFMA, TRANS.
+ // ALU implies VALU, SALU, MFMA, TRANS, PACK.
if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
- ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
- // VALU, SALU, MFMA, TRANS implies ALU.
+ ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS &
+ ~SchedGroupMask::PACK;
+ // VALU, SALU, MFMA, TRANS, PACK implies ALU.
else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
(InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
(InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
- (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
+ (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE ||
+ (InvertedMask & SchedGroupMask::PACK) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::ALU;
// VMEM implies VMEM_READ, VMEM_WRITE.
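For reference, a worked inversion for the new bit (assuming the 12-bit mask layout above; this is what the invert2048 test added below checks): for llvm.amdgcn.sched.barrier(2048), only PACK may cross, so the inverted group must hold everything else, and because PACK is absent from the inverted mask the ALU bit is cleared as well.

\[ \mathrm{InvertedMask} = \lnot\,\mathtt{0x800} \mathbin{\&} \mathtt{0xFFF} = \mathtt{0x7FF} = 2047 \]
\[ \mathrm{PACK} \notin \mathrm{InvertedMask} \;\Rightarrow\; 2047 \mathbin{\&} \lnot\,\mathrm{ALU} = 2046 = \mathtt{0b011111111110} \]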
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 37f335561a52c..ab1d6ca5f243f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -1625,6 +1625,189 @@ entry:
ret void
}
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_PACK_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in1) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 7, v6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_u32_e32 v7, s0, v8
+; GCN-NEXT: s_movk_i32 s0, 0xff88
+; GCN-NEXT: v_mad_i32_i24 v9, v6, s0, v7
+; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
+; GCN-NEXT: ds_read_b64 v[4:5], v9 offset:5120
+; GCN-NEXT: ds_read_b128 a[28:31], v7 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v7 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v7 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v7 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v7
+; GCN-NEXT: ds_read_b128 a[4:7], v7 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v7 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v7 offset:48
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
+; GCN-NEXT: v_add_u32_e32 v4, 0xc00, v9
+; GCN-NEXT: v_lshl_add_u32 v10, v6, 3, v4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
+; GCN-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
+; GCN-NEXT: ds_read_b64 v[4:5], v10 offset:3584
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-NEXT: v_add_u32_e32 v0, s1, v8
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[0:3]
+; GCN-NEXT: s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_PACK_MFMA:
+; EXACTCUTOFF: ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; EXACTCUTOFF-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v8, 7, v6
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v7, s0, v8
+; EXACTCUTOFF-NEXT: s_movk_i32 s0, 0xff88
+; EXACTCUTOFF-NEXT: v_mad_i32_i24 v9, v6, s0, v7
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset1:1
+; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v9 offset:5120
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v7 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v7 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v7 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v7 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v7
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v7 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v7 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v7 offset:48
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], 0 op_sel_hi:[1,1,0]
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0xc00, v9
+; EXACTCUTOFF-NEXT: v_lshl_add_u32 v10, v6, 3, v4
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[4:7], v10 offset0:4 offset1:5
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[4:5], v[2:3], v[4:5], v[0:1]
+; EXACTCUTOFF-NEXT: ds_read2st64_b64 v[0:3], v9 offset0:3 offset1:6
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 0
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v5, a[0:31]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[4:5]
+; EXACTCUTOFF-NEXT: ds_read_b64 v[4:5], v10 offset:3584
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 0
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], v[0:1]
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000800) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 1
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v8
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_nop 7
+; EXACTCUTOFF-NEXT: s_nop 7
+; EXACTCUTOFF-NEXT: s_nop 1
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
+ %el.0.addr = getelementptr <2 x float>, ptr addrspace(3) %in, i32 %idx
+ %el.0 = load <2 x float>, ptr addrspace(3) %el.0.addr
+ %el.1.addr = getelementptr <2 x float>, ptr addrspace(3) %el.0.addr, i32 64
+ %el.1 = load <2 x float>, ptr addrspace(3) %el.1.addr
+ %el.2.addr = getelementptr <2 x float>, ptr addrspace(3) %el.1.addr, i32 128
+ %el.2 = load <2 x float>, ptr addrspace(3) %el.2.addr
+ %el.3.addr = getelementptr <2 x float>, ptr addrspace(3) %el.2.addr, i32 192
+ %el.3 = load <2 x float>, ptr addrspace(3) %el.3.addr
+ %el.4.addr = getelementptr <2 x float>, ptr addrspace(3) %el.3.addr, i32 256
+ %el.4 = load <2 x float>, ptr addrspace(3) %el.4.addr
+ %el.5.addr = getelementptr <2 x float>, ptr addrspace(3) %el.4.addr, i32 %idx
+ %el.5 = load <2 x float>, ptr addrspace(3) %el.5.addr
+ %el.6.addr = getelementptr <2 x float>, ptr addrspace(3) %el.5.addr, i32 64
+ %el.6 = load <2 x float>, ptr addrspace(3) %el.6.addr
+ %el.7.addr = getelementptr <2 x float>, ptr addrspace(3) %el.6.addr, i32 128
+ %el.7 = load <2 x float>, ptr addrspace(3) %el.7.addr
+ %v0 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.0, <2 x float> %el.4, <2 x float> <float 0.0, float 0.0>)
+ %v1 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.1, <2 x float> %el.5, <2 x float> %v0)
+ %v2 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.2, <2 x float> %el.6, <2 x float> %v1)
+ %v3 = tail call contract <2 x float> @llvm.fma.v2f32(<2 x float> %el.3, <2 x float> %el.7, <2 x float> %v2)
+ %op0 = extractelement <2 x float> %v0, i32 0
+ %op1 = extractelement <2 x float> %v0, i32 1
+ %op2 = extractelement <2 x float> %v1, i32 0
+ %op3 = extractelement <2 x float> %v1, i32 1
+ %op4 = extractelement <2 x float> %v2, i32 0
+ %op5 = extractelement <2 x float> %v2, i32 1
+ %op6 = extractelement <2 x float> %v3, i32 0
+ %op7 = extractelement <2 x float> %v3, i32 1
+ %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op0, float %op1, <32 x float> %load.0, i32 0, i32 0, i32 0)
+ %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op2, float %op3, <32 x float> %mai.0, i32 0, i32 0, i32 0)
+ %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op4, float %op5, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+ %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %op6, float %op7, <32 x float> %mai.2, i32 0, i32 0, i32 0)
+ ; 1 PACK
+ call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+ ; 1 MFMA
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; 1 PACK
+ call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+ ; 1 MFMA
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; 1 PACK
+ call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+ ; 1 MFMA
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; 1 PACK
+ call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 1, i32 0)
+ ; 1 MFMA
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ %store.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+ store <32 x float> %mai.3, ptr addrspace(3) %store.addr
+ ret void
+}
+
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll b/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
index c20dbba42ccd4..14762dc329d9d 100644
--- a/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
+++ b/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
@@ -5,7 +5,7 @@
-; Inverted 1008: 01111110000
+; Inverted 1008: 001111110000
; GCN: After Inverting, SchedGroup Mask: 1008
define amdgpu_kernel void @invert1() #0 {
entry:
@@ -14,8 +14,8 @@ entry:
ret void
}
-; Inverted 2044: 11111111100
-; GCN: After Inverting, SchedGroup Mask: 2044
+; Inverted 4092: 111111111100
+; GCN: After Inverting, SchedGroup Mask: 4092
define amdgpu_kernel void @invert2() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 2) #1
@@ -23,8 +23,8 @@ entry:
ret void
}
-; Inverted 2042: 11111111010
-; GCN: After Inverting, SchedGroup Mask: 2042
+; Inverted 4090: 111111111010
+; GCN: After Inverting, SchedGroup Mask: 4090
define amdgpu_kernel void @invert4() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 4) #1
@@ -32,8 +32,8 @@ entry:
ret void
}
-; Inverted 2038: 11111110110
-; GCN: After Inverting, SchedGroup Mask: 2038
+; Inverted 4086: 111111110110
+; GCN: After Inverting, SchedGroup Mask: 4086
define amdgpu_kernel void @invert8() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 8) #1
@@ -41,8 +41,8 @@ entry:
ret void
}
-; Inverted 1935: 11110001111
-; GCN: After Inverting, SchedGroup Mask: 1935
+; Inverted 3983: 111110001111
+; GCN: After Inverting, SchedGroup Mask: 3983
define amdgpu_kernel void @invert16() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 16) #1
@@ -50,8 +50,8 @@ entry:
ret void
}
-; Inverted 1999: 11111001111
-; GCN: After Inverting, SchedGroup Mask: 1999
+; Inverted 4047: 111111001111
+; GCN: After Inverting, SchedGroup Mask: 4047
define amdgpu_kernel void @invert32() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 32) #1
@@ -59,8 +59,8 @@ entry:
ret void
}
-; Inverted 1967: 11110101111
-; GCN: After Inverting, SchedGroup Mask: 1967
+; Inverted 4015: 111110101111
+; GCN: After Inverting, SchedGroup Mask: 4015
define amdgpu_kernel void @invert64() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 64) #1
@@ -68,8 +68,8 @@ entry:
ret void
}
-; Inverted 1151: 10001111111
-; GCN: After Inverting, SchedGroup Mask: 1151
+; Inverted 3199: 110001111111
+; GCN: After Inverting, SchedGroup Mask: 3199
define amdgpu_kernel void @invert128() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 128) #1
@@ -77,8 +77,8 @@ entry:
ret void
}
-; Inverted 1663: 11001111111
-; GCN: After Inverting, SchedGroup Mask: 1663
+; Inverted 3711: 111001111111
+; GCN: After Inverting, SchedGroup Mask: 3711
define amdgpu_kernel void @invert256() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 256) #1
@@ -86,8 +86,8 @@ entry:
ret void
}
-; Inverted 1407: 10101111111
-; GCN: After Inverting, SchedGroup Mask: 1407
+; Inverted 3455: 110101111111
+; GCN: After Inverting, SchedGroup Mask: 3455
define amdgpu_kernel void @invert512() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 512) #1
@@ -95,8 +95,8 @@ entry:
ret void
}
-; Inverted 1022: 01111111110
-; GCN: After Inverting, SchedGroup Mask: 1022
+; Inverted 3070: 101111111110
+; GCN: After Inverting, SchedGroup Mask: 3070
define amdgpu_kernel void @invert1024() #0 {
entry:
call void @llvm.amdgcn.sched.barrier(i32 1024) #1
@@ -104,6 +104,16 @@ entry:
ret void
}
+; Inverted 2046: 011111111110
+; GCN: After Inverting, SchedGroup Mask: 2046
+define amdgpu_kernel void @invert2048() #0 {
+entry:
+ call void @llvm.amdgcn.sched.barrier(i32 2048) #1
+ call void @llvm.amdcn.s.nop(i16 0) #1
+ ret void
+}
+
+
declare void @llvm.amdgcn.sched.barrier(i32) #1
declare void @llvm.amdcn.s.nop(i16) #1
You can test this locally with the following command:
git-clang-format --diff 77ac5a2d57f207f173734cee2dfe652a2c2ae418 9dbb2023ba20d6989db8da23772611950053c248 --extensions cpp -- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index b8ee898b20..93d0983f7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2415,8 +2415,8 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)
- && !TII->isVOP3P(MI))
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) &&
+ !TII->isVOP3P(MI))
Result = true;
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
Change-Id: I375236dffc1289857c419baf3172a51c46249324
Change-Id: I4cbb79b168f451e2decd0775657dafba0243faab
I started drafting a TableGen approach, but it got pretty messy since different architectures are treated differently, and the non-coissue instructions span multiple instruction format files. This seems to be the cleaner approach. Still waiting on some info on the list for gfx950.
NON_COISSUE is even more nonspecifically broad.
arsenm left a comment:
The instruction type is not fundamental to the pressure problem you are trying to solve
Went a different direction |