From 2b542e36632104cbca9f53d1ce3a0e41c2047055 Mon Sep 17 00:00:00 2001 From: bababuck Date: Tue, 8 Apr 2025 13:32:15 -0700 Subject: [PATCH 1/3] Revert "AMDGPU: Disable machine verifier in failing test" This reverts commit bf2d1c46072a0461cb3ddcaefcafeccc2637995e. --- llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll index 936118750ff10..e130ce0a45467 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; FIXME: Fails expensive checks, should re-enable verifier, see issue #130884 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s ; Test that we can form v_pk_mov_b32 in certain shuffles when they ; originate from 32-bit physreg copy sequences. From 86789d194e65c030b54842f1f62982489541fcb4 Mon Sep 17 00:00:00 2001 From: bababuck Date: Mon, 14 Apr 2025 17:53:30 -0700 Subject: [PATCH 2/3] Add MIR test targeting #130884 --- .../AMDGPU/machine-scheduler-undef-reorder.mir | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir new file mode 100644 index 0000000000000..eefed7d379580 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir @@ -0,0 +1,17 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX900 %s + +--- +name: test_undef_reorder +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr2 + $vgpr5 = COPY $vgpr2 + $vgpr4 = COPY $vgpr2 + undef %18.sub1:vreg_64 = COPY killed $vgpr4 + %18.sub0:vreg_64 = COPY killed $vgpr5 + %19: vreg_64 = COPY %18 + S_ENDPGM 0 +... From cf5b0f7059abf489eeb5b3d6c96da0ac66f01b4a Mon Sep 17 00:00:00 2001 From: bababuck Date: Mon, 14 Apr 2025 17:53:30 -0700 Subject: [PATCH 3/3] Properly handle undef statements in ScheduleDagMI::moveInstruction If a subregister def is moved such that it is not live prior to that instruction, mark as Undef. Also remove any Undef flag from the next def of this register. Fixes #130884. --- llvm/lib/CodeGen/MachineScheduler.cpp | 30 ++++++++++++++++++- .../machine-scheduler-sink-trivial-remats.mir | 14 ++++----- .../machine-scheduler-undef-reorder.mir | 10 +++++++ .../AMDGPU/shufflevector-physreg-copy.ll | 8 ++--- 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 0c3ffb1bbaa6f..07d3a37dc9ce9 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -955,9 +955,37 @@ void ScheduleDAGMI::moveInstruction( BB->splice(InsertPos, BB, MI); // Update LiveIntervals - if (LIS) + if (LIS) { LIS->handleMove(*MI, /*UpdateFlags=*/true); + // Mark the move instruction definition as undef if needed, clear isUndef + // from the following instruction. + for (MachineOperand &MO : MI->operands()) { + if (MO.isReg() && MO.isDef() && MO.getSubReg() != 0) { + SlotIndex Index = LIS->getInstructionIndex(*MI); + LiveInterval &LI = LIS->getInterval(MO.getReg()); + // If the regsiter isn't live prior to the SubReg def and not already + // marked as Undef, add Undef flag. + if (!LI.liveAt(Index) && !MO.isUndef()) { + MO.setIsUndef(true); + LiveRange::iterator NextSeg = std::next(LI.find(Index)); + if (NextSeg != LI.end() && NextSeg->valno) { + if (MachineInstr *NextMI = + LIS->getInstructionFromIndex(NextSeg->valno->def)) { + // Remove Undef flag from the next def of this register in the + // LiveInterval since the moved instruction already marks the + // SubReg's as Undef + if (MachineOperand *NextMO = + NextMI->findRegisterDefOperand(MO.getReg(), TRI)) { + NextMO->setIsUndef(false); + } + } + } + } + } + } + } + // Recede RegionBegin if an instruction moves above the first. if (RegionBegin == InsertPos) RegionBegin = MI; diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index 9991cb1837e01..faabac2538d6e 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -729,7 +729,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -2795,7 +2795,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -2960,7 +2960,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -3138,7 +3138,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -3328,7 +3328,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -6353,7 +6353,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F32_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: S_BRANCH %bb.1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: @@ -6553,7 +6553,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F32_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF31]], implicit $exec, implicit $mode ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: dead [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: S_BRANCH %bb.1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir index eefed7d379580..c46d5aece8cd0 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-undef-reorder.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX900 %s --- @@ -8,6 +9,15 @@ machineFunctionInfo: body: | bb.0: liveins: $vgpr2 + ; GFX900-LABEL: name: test_undef_reorder + ; GFX900: liveins: $vgpr2 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: $vgpr5 = COPY $vgpr2 + ; GFX900-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr5 + ; GFX900-NEXT: $vgpr4 = COPY $vgpr2 + ; GFX900-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr4 + ; GFX900-NEXT: dead [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX900-NEXT: S_ENDPGM 0 $vgpr5 = COPY $vgpr2 $vgpr4 = COPY $vgpr2 undef %18.sub1:vreg_64 = COPY killed $vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll index e130ce0a45467..71e0e5afa81a0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll @@ -734,10 +734,10 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v4, v5, v6, v7 ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v10, v4 -; GFX900-NEXT: global_store_dwordx4 v0, v[7:10], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[16:17] ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31]