diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e934152d08acb..0c653b1b46d65 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1169,11 +1169,18 @@ void SIFoldOperandsImpl::foldOperand( // Grab the use operands first SmallVector<MachineOperand *> UsesToProcess( llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg))); - for (auto *RSUse : UsesToProcess) { + for (unsigned I = 0; I != UsesToProcess.size(); ++I) { + MachineOperand *RSUse = UsesToProcess[I]; MachineInstr *RSUseMI = RSUse->getParent(); unsigned OpNo = RSUseMI->getOperandNo(RSUse); if (SplatRC) { + if (RSUseMI->isCopy()) { + Register DstReg = RSUseMI->getOperand(0).getReg(); + append_range(UsesToProcess, + make_pointer_range(MRI->use_nodbg_operands(DstReg))); + continue; + } if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) { FoldableDef SplatDef(SplatVal, SplatRC); appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef); diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 5b0d2d22c3e3d..42401afb6edf2 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -3238,11 +3238,8 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX90A-GISEL: ; %bb.0: ; %bb ; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 -; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 ; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 ; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3253,11 +3250,8 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX942-GISEL-LABEL: 
fadd_fadd_fsub_0: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 ; GFX942-GISEL-NEXT: s_nop 0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir index 7852f5d0c96f5..23b24a22b69bc 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir @@ -1,11 +1,23 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s +# Check that we don't hang on this. 
--- name: fold_reg_sequence body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_reg_sequence + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 429 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 $vgpr2, [[REG_SEQUENCE]].sub0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[GLOBAL_LOAD_DWORD]], [[REG_SEQUENCE]].sub0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 %0:sreg_32 = S_MOV_B32 0 %1:sreg_32 = S_MOV_B32 429 %2:sreg_64 = REG_SEQUENCE killed %1, %subreg.sub0, %0, %subreg.sub1 @@ -13,6 +25,20 @@ body: | %4:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1) %5:vgpr_32 = V_MUL_HI_U32_e64 %4, %2.sub0, implicit $exec S_ENDPGM 0 - ... +# Fold through a COPY of REG_SEQUENCE. +--- +name: fold_through_copy +body: | + bb.0: + ; CHECK-LABEL: name: fold_through_copy + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, [[DEF]], 8, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %0:sreg_32 = S_MOV_B32 0 + %1:sreg_64 = REG_SEQUENCE %0:sreg_32, %subreg.sub0, %0:sreg_32, %subreg.sub1 + %2:sreg_64_xexec = IMPLICIT_DEF + %3:vreg_64_align2 = COPY %1:sreg_64 + %4:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %2:sreg_64_xexec, 8, %3:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +...