Skip to content

Conversation

@Shoreshen
Copy link
Contributor

@Shoreshen Shoreshen commented Jun 3, 2025

Cause:

  1. implicit_def inside bundle does not count for define of reg in machineinst verifier
  2. Including implicit_def will cause relative reg not define, result in Bad machine code: Using an undefined physical register in the machineinst verifier

Fixes #139102

@llvmbot
Copy link
Member

llvmbot commented Jun 3, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: None (Shoreshen)

Changes

Fix #139102

Cause:

  1. implicit_def inside bundle does not count for define of reg in machineinst verifier
  2. Including implicit_def will cause relative reg not define, result in Bad machine code: Using an undefined physical register in the machineinst verifier

Patch is 88.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142563.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/SIPostRABundler.cpp (+2-1)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+475-465)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+23-23)
  • (added) llvm/test/CodeGen/AMDGPU/bundle-break-phy-liveness.mir (+31)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index eb0977b92d5ab..1f3e549b0e27f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -145,7 +145,7 @@ class SIInsertHardClauses {
     // It's safe to treat the rest as illegal.
     if (MI.getOpcode() == AMDGPU::S_NOP)
       return HARDCLAUSE_INTERNAL;
-    if (MI.isMetaInstruction())
+    if (MI.isMetaInstruction() && MI.getOpcode() != AMDGPU::IMPLICIT_DEF)
       return HARDCLAUSE_IGNORE;
     return HARDCLAUSE_ILLEGAL;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index efdc55b8e68be..48f84286e9214 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -184,7 +184,8 @@ bool SIPostRABundler::run(MachineFunction &MF) {
           if (I->getNumExplicitDefs() != 0)
             Defs.insert(I->defs().begin()->getReg());
           ++ClauseLength;
-        } else if (!I->isMetaInstruction()) {
+        } else if (!I->isMetaInstruction() ||
+                   I->getOpcode() == AMDGPU::IMPLICIT_DEF) {
           // Allow meta instructions in between bundle candidates, but do not
           // start or end a bundle on one.
           //
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 44abfd272be88..1ab1b5b3b202d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -2,11 +2,11 @@
 
 ; FIXME: Currently block machineinstr verifier due to SI BUNDLE pass break physical register liveness. Should remove when the issue is fixed up
 
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs=0 < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs=0 < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) {
 ; SI-LABEL: bitcast_v32i32_to_v32f32:
@@ -4334,9 +4334,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -4438,18 +4443,45 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -4479,43 +4511,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
@@ -4694,6 +4695,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB12_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_u32_e32 v32, vcc, 3, v32
 ; VI-NEXT:    v_add_u32_e32 v31, vcc, 3, v31
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -5293,9 +5295,16 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -5389,14 +5398,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -5428,15 +5435,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -5484,11 +5491,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(30)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; kill: killed $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_2
@@ -42059,9 +42061,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
@@ -42163,18 +42170,45 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; kill: killed $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr56
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr47
+; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr44
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr58
@@ -42204,43 +42238,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB36_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
@@ -42419,6 +42422,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB36_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_add_f32_e32 v32, 1.0, v32
 ; VI-NEXT:    v_add_f32_e32 v31, 1.0, v31
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -43018,9 +43022,16 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
@@ -43114,14 +43125,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; kill: killed $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; kill: killed $vgpr50
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr38
@@ -43153,15 +43162,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    ; kill: killed $vgpr40
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-N...
[truncated]

@Shoreshen Shoreshen requested review from arsenm, jayfoad and shiltian June 3, 2025 09:15
@jayfoad
Copy link
Contributor

jayfoad commented Jun 3, 2025

It's a workaround, not a proper fix.

@Shoreshen
Copy link
Contributor Author

It's a workaround, not a proper fix.

Hi @jayfoad , if I'm not wrong the implicit_def inside a bundle will not add definition of registers. So if we want to fix this, we cannot leave it inside the bundle

@jayfoad
Copy link
Contributor

jayfoad commented Jun 4, 2025

It's a workaround, not a proper fix.

Hi jayfoad , if I'm not wrong the implicit_def inside a bundle will not add definition of registers. So if we want to fix this, we cannot leave it inside the bundle

Your patch will generate worse code in some cases (because it won't be able to form hard clauses in some cases) and it only avoids one specific case of this problem.

What if some other meta instruction with an implicit-def appears in a bundle? We will have the same problem again.

What if some non-meta instruction with an implicit-def appears in a bundle? We will have the same problem again.

A better fix would be to understand how finalizeBundle works, and fix it to do the right thing with implicits-defs.

@Shoreshen
Copy link
Contributor Author

Hi @jayfoad , from my perspective of view, if we want:

  1. Solve the use undefined problem
  2. Do not alter the logic of machine instruction verifier

We cannot include implicit_def into the bundle because inside the bundle implicit def will not count for defined register and any use of this register in the latter program will cause the verification error.

There will be bundle break by not including implicit_def. If it causes the program generate worse code, then the only possibility would be we move the implicit_def in front of the bundle.

However, SIPostRABundler and SIInsertHardClauses are all post RA pass, which means they are not SSA. So if we move the implicit_def, we need to make sure that there is no overlap liveness for the defining register of implicit_def.

For other meta instruction with an implicit-def or other non-meta instruction with an implicit-def can be within a bundle, I think I need some time to summarize it since I'm kind of a new bee for llvm codegen. But we can solve the current problem first.

In summary:

  1. To prevent both implicit_def inside the bundle and break bundle result in worse code, what we can do is:
  • if the register it is defining doesn't overlap with other instruction, move the implicit_def into the front of the bundle
  • if not, break the bundle and may result in worse code
  1. For other instructions that has implicit_def that can occur in the bundle, I'll read the code and try to understand the logic. We can handle all of them in this PR or solve the current problem first in this PR

@arsenm
Copy link
Contributor

arsenm commented Jun 5, 2025

We cannot include implicit_def into the bundle because inside the bundle implicit def will not count for defined register and any use of this register in the latter program will cause the verification error.

You can and should include implicit_def in the bundle. It does not matter that it's an implicit_def, the liveness of that def should work the same as any other real instruction inside the bundle.

if the register it is defining doesn't overlap with other instruction, move the implicit_def into the front of the bundle

You don't really need to move anything. I guess you could handle it this way, but I think it would be more difficult than just treating the implicit_def like any other load def in the bundle

@Shoreshen
Copy link
Contributor Author

Hi @arsenm , by "cannot include implicit_def in the BUNDLE" is based on if we do not change the logic of machineinst verifer.

However, personally I think its little bit complicated because it not only create use undefined reg error for instructions inside the bundle, but also for BUNDLE inst itself since it add reg def & use to the BUNDLE inst.

In order to solve error for BUNDLE inst itself, we can do:

  1. Exclude the def & use inside the BUNDLE from the BUNDLE inst's def & use. or
  2. Change the logic of verifier that do not check the reg liveness for BUNDLE inst (if use undefined reg occurs for BUNDLE itself, we need to further check inside instructions that it is not implicit defined, which act like normal reg liveness check)

Since the def & use of BUNDLE inst is hand written in different backends, it requires us to change all backend's BUNDLE creation logic if we want to use method 1.

If we use method 2, we give up opportunity to early reject bad code for BUNDLE.

So my preference is to adjust AMDGPU backend only for safety and its easier to modify. However if other method is the proper way of fixing or there maybe other methods can do better, I'm willing to implement it. Thanks

@jayfoad
Copy link
Contributor

jayfoad commented Jun 5, 2025

Since the def & use of BUNDLE inst is hand written in different backends, it requires us to change all backend's BUNDLE creation logic if we want to use method 1.

No, the generic code in finalizeBundle does this. It looks at the operands of the instructions in the bundle, and adds corresponding operands to the BUNDLE instruction.

The idea is that the operands on the bundle should have the same overall effect as the sequence of instructions-and-operands inside the bundle. This gets a complicated when a def operand of an instruction early in the bundle gets partially redefined by a def of an overlapping register later in the bundle.

@Shoreshen
Copy link
Contributor Author

Hi @jayfoad @arsenm , after read through finalizeBundle it seems like localdef cannot recognize that $vgpr2_vgpr3 was defined for their subreg $vgpr2 and $vgpr3.

I following the logic from MachineVerifier's liveness check for register that if 1 of the subreg is in localdef, then its locally defined.

This may cause problem that if we only define $vgpr2 but not $vgpr3. Currently I'm following the logic of MachineVerifier, but if needed, I think I can use regunit to avoid this problem.

Thanks for the kind help and illustration~~

SmallSetVector<MCRegUnit, 32> LocalDefsP, Register Reg,
const TargetRegisterInfo *TRI) {
if (Reg.isPhysical()) {
for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id())))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id())))
for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())))

Bundle.prepend(MIB);

SmallSetVector<Register, 32> LocalDefs;
SmallSetVector<MCRegUnit, 32> LocalDefsP;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RegUnits are supposed to be treated as a bitmask, using a SmallSetVector for them is unnecessarily heavy

if (!MO.isDead() && Reg.isPhysical())
LocalDefs.insert_range(TRI->subregs(Reg));
if (!MO.isDead() && Reg.isPhysical()) {
for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id())))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id())))
for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())))

LocalDefs.insert_range(TRI->subregs(Reg));
if (!MO.isDead() && Reg.isPhysical()) {
for (MCRegUnit Unit : TRI->regunits(MCRegister::from(Reg.id())))
LocalDefsP.insert(Unit);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LocalDefsP.insert(TRI->regunits()) probably works, but this should be a BitVector (e.g. see LiveRegUnits' usage)

return DebugLoc();
}

static bool containReg(SmallSetVector<Register, 32> LocalDefsV,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
static bool containReg(SmallSetVector<Register, 32> LocalDefsV,
static bool containsReg(SmallSetVector<Register, 32> LocalDefsV,

Comment on lines +101 to +104
for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
if (!LocalDefsP[Unit])
return false;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
if (!LocalDefsP[Unit])
return false;
for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
if (!LocalDefsP[Unit])
return false;
}

continue;

if (LocalDefs.contains(Reg)) {
if (containReg(LocalDefs, LocalDefsP, Reg, TRI)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (containReg(LocalDefs, LocalDefsP, Reg, TRI)) {
if (containsReg(LocalDefs, LocalDefsP, Reg, TRI)) {

return DebugLoc();
}

static bool containReg(SmallSetVector<Register, 32> LocalDefsV,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Document that LocalDefsV is for virtual registers, and LocalDefsP for physregs

@Shoreshen Shoreshen merged commit db96363 into llvm:main Aug 13, 2025
9 checks passed
gbossu added a commit that referenced this pull request Aug 14, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[AMDGPU] Bundling in SIInsertHardClauses can break physical register liveness

4 participants