-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Avoid put implicit_def into bundle that break reg's liveness #142563
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: None (Shoreshen) ChangesFix #139102 Cause:
Patch is 88.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142563.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index eb0977b92d5ab..1f3e549b0e27f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -145,7 +145,7 @@ class SIInsertHardClauses {
// It's safe to treat the rest as illegal.
if (MI.getOpcode() == AMDGPU::S_NOP)
return HARDCLAUSE_INTERNAL;
- if (MI.isMetaInstruction())
+ if (MI.isMetaInstruction() && MI.getOpcode() != AMDGPU::IMPLICIT_DEF)
return HARDCLAUSE_IGNORE;
return HARDCLAUSE_ILLEGAL;
}
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index efdc55b8e68be..48f84286e9214 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -184,7 +184,8 @@ bool SIPostRABundler::run(MachineFunction &MF) {
if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
++ClauseLength;
- } else if (!I->isMetaInstruction()) {
+ } else if (!I->isMetaInstruction() ||
+ I->getOpcode() == AMDGPU::IMPLICIT_DEF) {
// Allow meta instructions in between bundle candidates, but do not
// start or end a bundle on one.
//
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 44abfd272be88..1ab1b5b3b202d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -2,11 +2,11 @@
; FIXME: Currently block machineinstr verifier due to SI BUNDLE pass break physical register liveness. Should remove when the issue is fixed up
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs=0 < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs=0 < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) {
; SI-LABEL: bitcast_v32i32_to_v32f32:
@@ -4334,9 +4334,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -4438,18 +4443,45 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
@@ -4479,43 +4511,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB12_2
; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
@@ -4694,6 +4695,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB12_4
; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32
; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -5293,9 +5295,16 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -5389,14 +5398,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr38
@@ -5428,15 +5435,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -5484,11 +5491,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB12_2
@@ -42059,9 +42061,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -42163,18 +42170,45 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
@@ -42204,43 +42238,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB36_2
; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
@@ -42419,6 +42422,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB36_4
; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_f32_e32 v32, 1.0, v32
; VI-NEXT: v_add_f32_e32 v31, 1.0, v31
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -43018,9 +43022,16 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -43114,14 +43125,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr38
@@ -43153,15 +43162,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-N...
[truncated]
|
|
It's a workaround, not a proper fix. |
Hi @jayfoad , if I'm not wrong the implicit_def inside a bundle will not add definition of registers. So if we want to fix this, we cannot leave it inside the bundle |
Your patch will generate worse code in some cases (because it won't be able to form hard clauses in some cases) and it only avoids one specific case of this problem. What if some other meta instruction with an implicit-def appears in a bundle? We will have the same problem again. What if some non-meta instruction with an implicit-def appears in a bundle? We will have the same problem again. A better fix would be to understand how |
|
Hi @jayfoad , from my perspective of view, if we want:
We cannot include implicit_def into the bundle because inside the bundle implicit def will not count for defined register and any use of this register in the latter program will cause the verification error. There will be bundle break by not including implicit_def. If it causes the program generate worse code, then the only possibility would be we move the implicit_def in front of the bundle. However, SIPostRABundler and SIInsertHardClauses are all post RA pass, which means they are not SSA. So if we move the implicit_def, we need to make sure that there is no overlap liveness for the defining register of implicit_def. For other meta instruction with an implicit-def or other non-meta instruction with an implicit-def can be within a bundle, I think I need some time to summarize it since I'm kind of a new bee for llvm codegen. But we can solve the current problem first. In summary:
|
You can and should include implicit_def in the bundle. It does not matter that it's an implicit_def, the liveness of that def should work the same as any other real instruction inside the bundle.
You don't really need to move anything. I guess you could handle it this way, but I think it would be more difficult than just treating the implicit_def like any other load def in the bundle |
|
Hi @arsenm , by "cannot include implicit_def in the BUNDLE" is based on if we do not change the logic of machineinst verifer. However, personally I think its little bit complicated because it not only create use undefined reg error for instructions inside the bundle, but also for BUNDLE inst itself since it add reg def & use to the BUNDLE inst. In order to solve error for BUNDLE inst itself, we can do:
Since the def & use of BUNDLE inst is hand written in different backends, it requires us to change all backend's BUNDLE creation logic if we want to use method 1. If we use method 2, we give up opportunity to early reject bad code for BUNDLE. So my preference is to adjust AMDGPU backend only for safety and its easier to modify. However if other method is the proper way of fixing or there maybe other methods can do better, I'm willing to implement it. Thanks |
No, the generic code in The idea is that the operands on the bundle should have the same overall effect as the sequence of instructions-and-operands inside the bundle. This gets a complicated when a def operand of an instruction early in the bundle gets partially redefined by a def of an overlapping register later in the bundle. |
|
Hi @jayfoad @arsenm , after read through I following the logic from MachineVerifier's liveness check for register that if 1 of the subreg is in localdef, then its locally defined. This may cause problem that if we only define $vgpr2 but not $vgpr3. Currently I'm following the logic of MachineVerifier, but if needed, I think I can use regunit to avoid this problem. Thanks for the kind help and illustration~~ |
| SmallSetVector<MCRegUnit, 32> LocalDefsP, Register Reg, | ||
| const TargetRegisterInfo *TRI) { | ||
| if (Reg.isPhysical()) { | ||
| for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id()))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id()))) | |
| for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))) |
| Bundle.prepend(MIB); | ||
|
|
||
| SmallSetVector<Register, 32> LocalDefs; | ||
| SmallSetVector<MCRegUnit, 32> LocalDefsP; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RegUnits are supposed to be treated as a bitmask, using a SmallSetVector for them is unnecessarily heavy
| if (!MO.isDead() && Reg.isPhysical()) | ||
| LocalDefs.insert_range(TRI->subregs(Reg)); | ||
| if (!MO.isDead() && Reg.isPhysical()) { | ||
| for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id()))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id()))) | |
| for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))) |
| LocalDefs.insert_range(TRI->subregs(Reg)); | ||
| if (!MO.isDead() && Reg.isPhysical()) { | ||
| for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id()))) | ||
| LocalDefsP.insert(Unit); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LocalDefsP.insert(TRI->regunits()) probably works, but this should be a BitVector (e.g. see LiveRegUnits' usage)
Co-authored-by: Matt Arsenault <[email protected]>
| return DebugLoc(); | ||
| } | ||
|
|
||
| static bool containReg(SmallSetVector<Register, 32> LocalDefsV, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| static bool containReg(SmallSetVector<Register, 32> LocalDefsV, | |
| static bool containsReg(SmallSetVector<Register, 32> LocalDefsV, |
| for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) | ||
| if (!LocalDefsP[Unit]) | ||
| return false; | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) | |
| if (!LocalDefsP[Unit]) | |
| return false; | |
| for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { | |
| if (!LocalDefsP[Unit]) | |
| return false; | |
| } | |
| continue; | ||
|
|
||
| if (LocalDefs.contains(Reg)) { | ||
| if (containReg(LocalDefs, LocalDefsP, Reg, TRI)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (containReg(LocalDefs, LocalDefsP, Reg, TRI)) { | |
| if (containsReg(LocalDefs, LocalDefsP, Reg, TRI)) { |
| return DebugLoc(); | ||
| } | ||
|
|
||
| static bool containReg(SmallSetVector<Register, 32> LocalDefsV, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Document that LocalDefsV is for virtual registers, and LocalDefsP for physregs
Cause:
implicit_definside bundle does not count for define of reg in machineinst verifierimplicit_defwill cause relative reg not define, result inBad machine code: Using an undefined physical registerin the machineinst verifierFixes #139102