-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU] Treat XDL ops as TRANS on gfx12 #130692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: None (sstipano) ChangesFull diff: https://github.com/llvm/llvm-project/pull/130692.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index b25619b4c5422..1ff7589339a14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;
@@ -51,12 +52,15 @@ class AMDGPUInsertDelayAlu {
enum DelayType { VALU, TRANS, SALU, OTHER };
// Get the delay type for an instruction with the specified TSFlags.
- static DelayType getDelayType(uint64_t TSFlags) {
- if (TSFlags & SIInstrFlags::TRANS)
+ DelayType getDelayType(MachineInstr &MI) {
+ if (SIInstrInfo::isTRANS(MI))
return TRANS;
- if (TSFlags & SIInstrFlags::VALU)
+ // XDL ops are treated the same as TRANS.
+ if (SII->isXDL(MI))
+ return TRANS;
+ if (SIInstrInfo::isVALU(MI))
return VALU;
- if (TSFlags & SIInstrFlags::SALU)
+ if (SIInstrInfo::isSALU(MI))
return SALU;
return OTHER;
}
@@ -343,7 +347,7 @@ class AMDGPUInsertDelayAlu {
continue;
}
- DelayType Type = getDelayType(MI.getDesc().TSFlags);
+ DelayType Type = getDelayType(MI);
if (instructionWaitsForVALU(MI)) {
// Forget about all outstanding VALU delays.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ae285d069d876..5095cecc8732f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10262,6 +10262,8 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
+ if (AMDGPU::isGFX12(ST))
+ return isWMMA(MI) || isSWMMAC(MI) || isDOT(MI);
if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) ||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index 1e9ef07ba7542..b8394faafbd8a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -20,7 +20,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
@@ -64,7 +64,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
@@ -102,7 +102,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
@@ -140,7 +140,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
@@ -184,7 +184,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -280,7 +280,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -324,7 +324,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -368,7 +368,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -412,7 +412,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index 311e76b9bb2b0..5d5c30126979f 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -20,7 +20,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
@@ -64,7 +64,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
@@ -102,7 +102,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
@@ -140,7 +140,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
@@ -184,7 +184,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -280,7 +280,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -324,7 +324,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -368,7 +368,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
@@ -412,7 +412,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
|
| // Get the delay type for an instruction with the specified TSFlags. | ||
| static DelayType getDelayType(uint64_t TSFlags) { | ||
| if (TSFlags & SIInstrFlags::TRANS) | ||
| DelayType getDelayType(MachineInstr &MI) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should still be a "static" function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isXDL is not static, so this can't be as well.
|
|
||
| bool SIInstrInfo::isXDL(const MachineInstr &MI) const { | ||
| unsigned Opcode = MI.getOpcode(); | ||
| if (AMDGPU::isGFX12(ST)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As you may have already known, a WMMA does not necessarily be a XDL for downstream branches.
Is AMDGPU::isGFX12(ST) the same as AMDGPU::isGFX12Plus(ST)? So we need to be careful for downstream.
changpeng
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, Thanks
|
Section 5.8. "ALU Instruction Software Scheduling" in the public RDNA4 ISA says
Is this list incomplete? |
|
Public RDNA4 ISA is correct, I made a mistake. Sorry for the confusion. |
No description provided.