From 9971bf8ade79a4876c433b4ef7ac060857d72d3e Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Thu, 3 Apr 2025 15:07:17 +0200 Subject: [PATCH 1/4] [AMDGPU] Support arbitrary types in amdgcn.dead Legalize the amdgcn.dead intrinsic to work with types other than i32. It still generates IMPLICIT_DEFs. Remove some of the previous code for selecting/reg bank mapping it for 32-bit types, since everything is done in the legalizer now. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 6 - .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 + llvm/lib/Target/AMDGPU/SIInstructions.td | 6 - .../CodeGen/AMDGPU/legalize-amdgcn.dead.mir | 32 ++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll | 384 +++++++++++++++++- 7 files changed, 430 insertions(+), 16 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/legalize-amdgcn.dead.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 6ef7505ec6f62..e6caffe61e705 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1191,12 +1191,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { case Intrinsic::amdgcn_permlane16_swap: case Intrinsic::amdgcn_permlane32_swap: return selectPermlaneSwapIntrin(I, IntrinsicID); - case Intrinsic::amdgcn_dead: { - I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); - I.removeOperand(1); // drop intrinsic ID - return RBI.constrainGenericRegister(I.getOperand(0).getReg(), - AMDGPU::VGPR_32RegClass, *MRI); - } default: return selectImpl(I, *CoverageInfo); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 275d0193452a5..5d35a15123d63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7651,6 +7651,12 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeLaneOp(Helper, MI, IntrID); case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); + case Intrinsic::amdgcn_dead: { + for (const MachineOperand &Def : MI.defs()) + B.buildUndef(Def); + MI.eraseFromParent(); + return true; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1d0e81db5a5db..f38665ee81bda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4701,7 +4701,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_ds_bpermute_fi_b32: - case Intrinsic::amdgcn_dead: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_cvt_pkrtz: if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 356040da95672..006717d141027 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6629,6 +6629,11 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(LoadVal); return; } + case Intrinsic::amdgcn_dead: { + for (unsigned I = 0, E = N->getNumValues(); I < E; ++I) + Results.push_back(DAG.getUNDEF(N->getValueType(I))); + return; + } } break; } @@ -9116,6 +9121,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_update_dpp: return lowerLaneOp(*this, Op.getNode(), DAG); + case Intrinsic::amdgcn_dead: { + SmallVector Undefs; + for (unsigned I = 0, E = Op.getNode()->getNumValues(); I != E; ++I) + Undefs.push_back(DAG.getUNDEF(Op.getNode()->getValueType(I))); + return DAG.getMergeValues(Undefs, SDLoc(Op)); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9051db0c01ed1..fe384b33911b9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -4484,9 +4484,3 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> { let hasSideEffects = 1; let SubtargetPredicate = isGFX10Plus; } - -// FIXME: Would be nice if we could set the register class for the destination -// register too. -def IMP_DEF_FROM_INTRINSIC: Pat< - (i32 (int_amdgcn_dead)), (IMPLICIT_DEF)>; - diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.dead.mir b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.dead.mir new file mode 100644 index 0000000000000..ec940f8d3b0b0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.dead.mir @@ -0,0 +1,32 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amdpal -mcpu=gfx1200 -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: test_struct +body: | + bb.1.entry: + + ; CHECK-LABEL: name: test_struct + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](<3 x s32>) + ; CHECK-NEXT: $vgpr0 = COPY [[DEF]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr2 = COPY [[UV1]](s32) + ; CHECK-NEXT: $vgpr3 = COPY [[UV2]](s32) + ; CHECK-NEXT: $vgpr4_vgpr5 = COPY [[DEF2]](s64) + ; CHECK-NEXT: $vgpr6 = COPY [[DEF3]](<2 x s16>) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %0:_(s32), %1:_(<3 x s32>), %2:_(s64), %3:_(<2 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.dead) + + %4:_(s32), %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %1(<3 x s32>) + $vgpr0 = COPY %0(s32) + $vgpr1 = COPY %4(s32) + $vgpr2 = COPY %5(s32) + $vgpr3 = COPY %6(s32) + $vgpr4_vgpr5 = COPY %2(s64) + $vgpr6 = COPY %3(<2 x s16>) + SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll index a009854542f21..ad3a316c4c91c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll @@ -3,8 +3,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-GISEL %s ; Test that we can use v0 for temporaries in the if.then block. -define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) #0 { -; ASM-DAG-LABEL: dead: +define i32 @dead_i32(i1 %cond, i32 %x, ptr addrspace(1) %ptr1) #0 { +; ASM-DAG-LABEL: dead_i32: ; ASM-DAG: ; %bb.0: ; %entry ; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; ASM-DAG-NEXT: s_wait_expcnt 0x0 @@ -27,7 +27,7 @@ define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr ; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; ASM-DAG-NEXT: s_setpc_b64 s[30:31] ; -; ASM-GISEL-LABEL: dead: +; ASM-GISEL-LABEL: dead_i32: ; ASM-GISEL: ; %bb.0: ; %entry ; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; ASM-GISEL-NEXT: s_wait_expcnt 0x0 @@ -62,3 +62,381 @@ if.end: %res = phi i32 [ %x, %entry ], [ %dead, %if.then ] ret i32 %res } + +%trivial_types = type { i32, float, <3 x i32>, i64, ptr addrspace(5), ptr addrspace(1), <4 x float>, { float, <2 x i16> } } + +define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) %ptr1, i32 %v) #0 { +; ASM-DAG-LABEL: dead_struct: +; ASM-DAG: ; %bb.0: ; %entry +; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; ASM-DAG-NEXT: s_wait_expcnt 0x0 +; ASM-DAG-NEXT: s_wait_samplecnt 0x0 +; ASM-DAG-NEXT: s_wait_bvhcnt 0x0 +; ASM-DAG-NEXT: s_wait_kmcnt 0x0 +; ASM-DAG-NEXT: v_mov_b32_e32 v20, v0 +; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1 +; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo +; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v20 +; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1 +; ASM-DAG-NEXT: s_cbranch_execz .LBB1_2 +; ASM-DAG-NEXT: ; %bb.1: ; %if.then +; ASM-DAG-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_add_nc_u32 v0, 15, v19 +; ASM-DAG-NEXT: v_mov_b32_e32 v2, 0x3fc00000 +; ASM-DAG-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; ASM-DAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; ASM-DAG-NEXT: ; implicit-def: $vgpr8 +; ASM-DAG-NEXT: ; implicit-def: $vgpr9_vgpr10 +; ASM-DAG-NEXT: ; implicit-def: $vgpr15 +; ASM-DAG-NEXT: ; implicit-def: $vgpr16 +; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; ASM-DAG-NEXT: v_dual_mov_b32 v12, v11 :: v_dual_mov_b32 v13, v11 +; ASM-DAG-NEXT: v_mov_b32_e32 v14, v11 +; ASM-DAG-NEXT: global_store_b32 v[17:18], v0, off +; ASM-DAG-NEXT: ; implicit-def: $vgpr0 +; ASM-DAG-NEXT: .LBB1_2: ; %if.end +; ASM-DAG-NEXT: s_wait_alu 0xfffe +; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; ASM-DAG-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; ASM-DAG-NEXT: v_dual_mov_b32 v3, v4 :: v_dual_mov_b32 v4, v5 +; ASM-DAG-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 +; ASM-DAG-NEXT: v_dual_mov_b32 v7, v8 :: v_dual_mov_b32 v8, v9 +; ASM-DAG-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 +; ASM-DAG-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 +; ASM-DAG-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 +; ASM-DAG-NEXT: v_mov_b32_e32 v15, v16 +; ASM-DAG-NEXT: s_setpc_b64 s[30:31] +; +; ASM-GISEL-LABEL: dead_struct: +; ASM-GISEL: ; %bb.0: ; %entry +; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; ASM-GISEL-NEXT: s_wait_expcnt 0x0 +; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 +; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 +; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 +; ASM-GISEL-NEXT: v_mov_b32_e32 v20, v0 +; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2 +; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo +; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; ASM-GISEL-NEXT: v_and_b32_e32 v2, 1, v20 +; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v2 +; ASM-GISEL-NEXT: s_cbranch_execz .LBB1_2 +; ASM-GISEL-NEXT: ; %bb.1: ; %if.then +; ASM-GISEL-NEXT: s_mov_b32 s4, 0 +; ASM-GISEL-NEXT: s_mov_b32 s1, 0x3fc00000 +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: s_mov_b32 s7, s4 +; ASM-GISEL-NEXT: s_mov_b32 s5, s4 +; ASM-GISEL-NEXT: s_mov_b32 s6, s4 +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v13, s6 +; ASM-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 +; ASM-GISEL-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr8 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr15 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr16 +; ASM-GISEL-NEXT: global_store_b32 v[17:18], v0, off +; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 +; ASM-GISEL-NEXT: .LBB1_2: ; %if.end +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 +; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8 +; ASM-GISEL-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10 +; ASM-GISEL-NEXT: v_dual_mov_b32 v10, v11 :: v_dual_mov_b32 v11, v12 +; ASM-GISEL-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 +; ASM-GISEL-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 +; ASM-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + br i1 %cond, label %if.then, label %if.end + +if.then: + %dead = call %trivial_types @llvm.amdgcn.dead.s_trivial_typess() + %dead_insert_1 = insertvalue %trivial_types %dead, float 1.5, 1 + %dead_insert_3 = insertvalue %trivial_types %dead_insert_1, <4 x float> zeroinitializer, 6 + + %vgpr_use = add i32 %v, 15 ; may use v0 or one of the other implicit_defs + store i32 %vgpr_use, ptr addrspace(1) %ptr1 + + br label %if.end + +if.end: + %res = phi %trivial_types [ %x, %entry ], [ %dead_insert_3, %if.then ] + ret %trivial_types %res +} + +define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i32 %v) #0 { +; ASM-DAG-LABEL: dead_array: +; ASM-DAG: ; %bb.0: ; %entry +; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; ASM-DAG-NEXT: s_wait_expcnt 0x0 +; ASM-DAG-NEXT: s_wait_samplecnt 0x0 +; ASM-DAG-NEXT: s_wait_bvhcnt 0x0 +; ASM-DAG-NEXT: s_wait_kmcnt 0x0 +; ASM-DAG-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v0 +; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1 +; ASM-DAG-NEXT: s_clause 0x4 +; ASM-DAG-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; ASM-DAG-NEXT: scratch_load_b32 v34, off, s32 offset:8 +; ASM-DAG-NEXT: scratch_load_b32 v31, off, s32 offset:4 +; ASM-DAG-NEXT: scratch_load_b32 v30, off, s32 +; ASM-DAG-NEXT: scratch_load_b32 v1, off, s32 offset:16 +; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo +; ASM-DAG-NEXT: v_and_b32_e32 v33, 1, v33 +; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v33 +; ASM-DAG-NEXT: s_cbranch_execz .LBB2_2 +; ASM-DAG-NEXT: ; %bb.1: ; %if.then +; ASM-DAG-NEXT: v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v7, 13 +; ASM-DAG-NEXT: s_wait_loadcnt 0x0 +; ASM-DAG-NEXT: v_add_nc_u32_e32 v0, 15, v1 +; ASM-DAG-NEXT: ; implicit-def: $vgpr2 +; ASM-DAG-NEXT: ; implicit-def: $vgpr3 +; ASM-DAG-NEXT: ; implicit-def: $vgpr4 +; ASM-DAG-NEXT: ; implicit-def: $vgpr5 +; ASM-DAG-NEXT: ; implicit-def: $vgpr6 +; ASM-DAG-NEXT: ; implicit-def: $vgpr9 +; ASM-DAG-NEXT: ; implicit-def: $vgpr10 +; ASM-DAG-NEXT: ; implicit-def: $vgpr11 +; ASM-DAG-NEXT: ; implicit-def: $vgpr12 +; ASM-DAG-NEXT: ; implicit-def: $vgpr13 +; ASM-DAG-NEXT: ; implicit-def: $vgpr14 +; ASM-DAG-NEXT: ; implicit-def: $vgpr15 +; ASM-DAG-NEXT: ; implicit-def: $vgpr16 +; ASM-DAG-NEXT: ; implicit-def: $vgpr17 +; ASM-DAG-NEXT: ; implicit-def: $vgpr18 +; ASM-DAG-NEXT: ; implicit-def: $vgpr19 +; ASM-DAG-NEXT: ; implicit-def: $vgpr20 +; ASM-DAG-NEXT: ; implicit-def: $vgpr21 +; ASM-DAG-NEXT: ; implicit-def: $vgpr22 +; ASM-DAG-NEXT: ; implicit-def: $vgpr23 +; ASM-DAG-NEXT: ; implicit-def: $vgpr24 +; ASM-DAG-NEXT: ; implicit-def: $vgpr25 +; ASM-DAG-NEXT: ; implicit-def: $vgpr26 +; ASM-DAG-NEXT: ; implicit-def: $vgpr27 +; ASM-DAG-NEXT: ; implicit-def: $vgpr28 +; ASM-DAG-NEXT: ; implicit-def: $vgpr29 +; ASM-DAG-NEXT: ; implicit-def: $vgpr32 +; ASM-DAG-NEXT: ; implicit-def: $vgpr30 +; ASM-DAG-NEXT: ; implicit-def: $vgpr31 +; ASM-DAG-NEXT: global_store_b32 v[34:35], v0, off +; ASM-DAG-NEXT: ; implicit-def: $vgpr0 +; ASM-DAG-NEXT: .LBB2_2: ; %if.end +; ASM-DAG-NEXT: s_wait_alu 0xfffe +; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; ASM-DAG-NEXT: s_wait_loadcnt 0x0 +; ASM-DAG-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; ASM-DAG-NEXT: v_dual_mov_b32 v3, v4 :: v_dual_mov_b32 v4, v5 +; ASM-DAG-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 +; ASM-DAG-NEXT: v_dual_mov_b32 v7, v8 :: v_dual_mov_b32 v8, v9 +; ASM-DAG-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 +; ASM-DAG-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 +; ASM-DAG-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 +; ASM-DAG-NEXT: v_dual_mov_b32 v15, v16 :: v_dual_mov_b32 v16, v17 +; ASM-DAG-NEXT: v_dual_mov_b32 v17, v18 :: v_dual_mov_b32 v18, v19 +; ASM-DAG-NEXT: v_dual_mov_b32 v19, v20 :: v_dual_mov_b32 v20, v21 +; ASM-DAG-NEXT: v_dual_mov_b32 v21, v22 :: v_dual_mov_b32 v22, v23 +; ASM-DAG-NEXT: v_dual_mov_b32 v23, v24 :: v_dual_mov_b32 v24, v25 +; ASM-DAG-NEXT: v_dual_mov_b32 v25, v26 :: v_dual_mov_b32 v26, v27 +; ASM-DAG-NEXT: v_dual_mov_b32 v27, v28 :: v_dual_mov_b32 v28, v29 +; ASM-DAG-NEXT: v_mov_b32_e32 v29, v32 +; ASM-DAG-NEXT: s_setpc_b64 s[30:31] +; +; ASM-GISEL-LABEL: dead_array: +; ASM-GISEL: ; %bb.0: ; %entry +; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; ASM-GISEL-NEXT: s_wait_expcnt 0x0 +; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 +; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 +; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 +; ASM-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2 +; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 +; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8 +; ASM-GISEL-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10 +; ASM-GISEL-NEXT: v_dual_mov_b32 v10, v11 :: v_dual_mov_b32 v11, v12 +; ASM-GISEL-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 +; ASM-GISEL-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 +; ASM-GISEL-NEXT: v_dual_mov_b32 v16, v17 :: v_dual_mov_b32 v17, v18 +; ASM-GISEL-NEXT: v_dual_mov_b32 v18, v19 :: v_dual_mov_b32 v19, v20 +; ASM-GISEL-NEXT: v_dual_mov_b32 v20, v21 :: v_dual_mov_b32 v21, v22 +; ASM-GISEL-NEXT: v_dual_mov_b32 v22, v23 :: v_dual_mov_b32 v23, v24 +; ASM-GISEL-NEXT: v_dual_mov_b32 v24, v25 :: v_dual_mov_b32 v25, v26 +; ASM-GISEL-NEXT: v_dual_mov_b32 v26, v27 :: v_dual_mov_b32 v27, v28 +; ASM-GISEL-NEXT: v_dual_mov_b32 v28, v29 :: v_dual_mov_b32 v29, v30 +; ASM-GISEL-NEXT: s_clause 0x4 +; ASM-GISEL-NEXT: scratch_load_b32 v30, off, s32 +; ASM-GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:4 +; ASM-GISEL-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; ASM-GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; ASM-GISEL-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; ASM-GISEL-NEXT: v_and_b32_e32 v32, 1, v32 +; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo +; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v32 +; ASM-GISEL-NEXT: s_cbranch_execz .LBB2_2 +; ASM-GISEL-NEXT: ; %bb.1: ; %if.then +; ASM-GISEL-NEXT: s_mov_b32 s1, 15 +; ASM-GISEL-NEXT: s_mov_b32 s2, 13 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_add_nc_u32 v0, 15, v35 +; ASM-GISEL-NEXT: v_mov_b32_e32 v6, s2 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr1 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr2 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr3 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr4 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr5 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr8 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr9 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr10 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr11 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr12 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr13 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr14 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr15 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr16 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr17 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr18 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr19 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr20 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr21 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr22 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr23 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr24 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr25 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr26 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr27 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr28 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr29 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr30 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr31 +; ASM-GISEL-NEXT: global_store_b32 v[33:34], v0, off +; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 +; ASM-GISEL-NEXT: .LBB2_2: ; %if.end +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 +; ASM-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + br i1 %cond, label %if.then, label %if.end + +if.then: + %dead = call [32 x i32] @llvm.amdgcn.dead() + %dead_insert_1 = insertvalue [32 x i32] %dead, i32 15, 7 + %dead_insert_3 = insertvalue [32 x i32] %dead_insert_1, i32 13, 6 + + %vgpr_use = add i32 %v, 15 ; may use v0 or one of the other implicit_defs + store i32 %vgpr_use, ptr addrspace(1) %ptr1 + + br label %if.end + +if.end: + %res = phi [32 x i32] [ %x, %entry ], [ %dead_insert_3, %if.then ] + ret [32 x i32] %res +} + +%non_trivial_types = type { i8, i16, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, <5 x i32>, i128} + +define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr addrspace(1) %ptr1, i32 %v) #0 { +; ASM-DAG-LABEL: dead_non_trivial: +; ASM-DAG: ; %bb.0: ; %entry +; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; ASM-DAG-NEXT: s_wait_expcnt 0x0 +; ASM-DAG-NEXT: s_wait_samplecnt 0x0 +; ASM-DAG-NEXT: s_wait_bvhcnt 0x0 +; ASM-DAG-NEXT: s_wait_kmcnt 0x0 +; ASM-DAG-NEXT: v_mov_b32_e32 v20, v0 +; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1 +; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo +; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v20 +; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1 +; ASM-DAG-NEXT: s_cbranch_execz .LBB3_2 +; ASM-DAG-NEXT: ; %bb.1: ; %if.then +; ASM-DAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_add_nc_u32 v0, 15, v19 +; ASM-DAG-NEXT: v_mov_b32_e32 v3, 0x3e00 +; ASM-DAG-NEXT: ; implicit-def: $vgpr2 +; ASM-DAG-NEXT: ; implicit-def: $vgpr4 +; ASM-DAG-NEXT: ; implicit-def: $vgpr5 +; ASM-DAG-NEXT: ; implicit-def: $vgpr6 +; ASM-DAG-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 +; ASM-DAG-NEXT: ; implicit-def: $vgpr13_vgpr14 +; ASM-DAG-NEXT: ; implicit-def: $vgpr15_vgpr16 +; ASM-DAG-NEXT: global_store_b32 v[17:18], v0, off +; ASM-DAG-NEXT: ; implicit-def: $vgpr0 +; ASM-DAG-NEXT: .LBB3_2: ; %if.end +; ASM-DAG-NEXT: s_wait_alu 0xfffe +; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; ASM-DAG-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; ASM-DAG-NEXT: v_dual_mov_b32 v3, v4 :: v_dual_mov_b32 v4, v5 +; ASM-DAG-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 +; ASM-DAG-NEXT: v_dual_mov_b32 v7, v8 :: v_dual_mov_b32 v8, v9 +; ASM-DAG-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 +; ASM-DAG-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 +; ASM-DAG-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 +; ASM-DAG-NEXT: v_mov_b32_e32 v15, v16 +; ASM-DAG-NEXT: s_setpc_b64 s[30:31] +; +; ASM-GISEL-LABEL: dead_non_trivial: +; ASM-GISEL: ; %bb.0: ; %entry +; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; ASM-GISEL-NEXT: s_wait_expcnt 0x0 +; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 +; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 +; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 +; ASM-GISEL-NEXT: v_mov_b32_e32 v20, v0 +; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2 +; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 +; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_and_b32 v7, 1, v20 +; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo +; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v7 +; ASM-GISEL-NEXT: s_cbranch_execz .LBB3_2 +; ASM-GISEL-NEXT: ; %bb.1: ; %if.then +; ASM-GISEL-NEXT: s_movk_i32 s1, 0x3e00 +; ASM-GISEL-NEXT: s_mov_b32 s2, 0 +; ASM-GISEL-NEXT: v_add_nc_u32_e32 v0, 15, v19 +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: v_mov_b32_e32 v2, s1 +; ASM-GISEL-NEXT: v_mov_b32_e32 v6, s2 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr1 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr3 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr4 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr5 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15_vgpr16 +; ASM-GISEL-NEXT: global_store_b32 v[17:18], v0, off +; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 +; ASM-GISEL-NEXT: .LBB3_2: ; %if.end +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; ASM-GISEL-NEXT: v_dual_mov_b32 v7, v8 :: v_dual_mov_b32 v8, v9 +; ASM-GISEL-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 +; ASM-GISEL-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 +; ASM-GISEL-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 +; ASM-GISEL-NEXT: v_mov_b32_e32 v15, v16 +; ASM-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + br i1 %cond, label %if.then, label %if.end + +if.then: + %dead = call %non_trivial_types @llvm.amdgcn.dead.s_non_trivial_typess() + %dead_insert_1 = insertvalue %non_trivial_types %dead, half 1.5, 2 + %dead_insert_3 = insertvalue %non_trivial_types %dead_insert_1, <2 x bfloat> zeroinitializer, 6 + + %vgpr_use = add i32 %v, 15 ; may use v0 or one of the other implicit_defs + store i32 %vgpr_use, ptr addrspace(1) %ptr1 + + br label %if.end + +if.end: + %res = phi %non_trivial_types [ %x, %entry ], [ %dead_insert_3, %if.then ] + ret %non_trivial_types %res +} From bbc9f776ef9d993b4247f2b728740bb48206cedb Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Wed, 23 Apr 2025 10:36:30 +0200 Subject: [PATCH 2/4] Address review comments --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll | 55 ++++++++++++++----- 3 files changed, 45 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5d35a15123d63..152f0c00b7994 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7652,6 +7652,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); case Intrinsic::amdgcn_dead: { + // TODO: Use poison instead of undef for (const MachineOperand &Def : MI.defs()) B.buildUndef(Def); MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 006717d141027..8c5fc34f385b3 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6631,7 +6631,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case Intrinsic::amdgcn_dead: { for (unsigned I = 0, E = N->getNumValues(); I < E; ++I) - Results.push_back(DAG.getUNDEF(N->getValueType(I))); + Results.push_back(DAG.getPOISON(N->getValueType(I))); return; } } @@ -9122,10 +9122,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_update_dpp: return lowerLaneOp(*this, Op.getNode(), DAG); case Intrinsic::amdgcn_dead: { - SmallVector Undefs; + SmallVector Poisons; for (unsigned I = 0, E = Op.getNode()->getNumValues(); I != E; ++I) - Undefs.push_back(DAG.getUNDEF(Op.getNode()->getValueType(I))); - return DAG.getMergeValues(Undefs, SDLoc(Op)); + Poisons.push_back(DAG.getPOISON(Op.getNode()->getValueType(I))); + return DAG.getMergeValues(Poisons, SDLoc(Op)); } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll index ad3a316c4c91c..cb45758bba6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll @@ -341,7 +341,7 @@ if.end: ret [32 x i32] %res } -%non_trivial_types = type { i8, i16, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, <5 x i32>, i128} +%non_trivial_types = type { i8, i16, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, <5 x i32>, i128, <7 x i16>} define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr addrspace(1) %ptr1, i32 %v) #0 { ; ASM-DAG-LABEL: dead_non_trivial: @@ -351,15 +351,15 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr ; ASM-DAG-NEXT: s_wait_samplecnt 0x0 ; ASM-DAG-NEXT: s_wait_bvhcnt 0x0 ; ASM-DAG-NEXT: s_wait_kmcnt 0x0 -; ASM-DAG-NEXT: v_mov_b32_e32 v20, v0 +; ASM-DAG-NEXT: v_mov_b32_e32 v24, v0 ; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1 ; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo ; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v20 +; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v24 ; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1 ; ASM-DAG-NEXT: s_cbranch_execz .LBB3_2 ; ASM-DAG-NEXT: ; %bb.1: ; %if.then -; ASM-DAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_add_nc_u32 v0, 15, v19 +; ASM-DAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_add_nc_u32 v0, 15, v23 ; ASM-DAG-NEXT: v_mov_b32_e32 v3, 0x3e00 ; ASM-DAG-NEXT: ; implicit-def: $vgpr2 ; ASM-DAG-NEXT: ; implicit-def: $vgpr4 @@ -368,7 +368,8 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr ; ASM-DAG-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 ; ASM-DAG-NEXT: ; implicit-def: $vgpr13_vgpr14 ; ASM-DAG-NEXT: ; implicit-def: $vgpr15_vgpr16 -; ASM-DAG-NEXT: global_store_b32 v[17:18], v0, off +; ASM-DAG-NEXT: ; implicit-def: $vgpr17_vgpr18_vgpr19_vgpr20 +; ASM-DAG-NEXT: global_store_b32 v[21:22], v0, off ; ASM-DAG-NEXT: ; implicit-def: $vgpr0 ; ASM-DAG-NEXT: .LBB3_2: ; %if.end ; ASM-DAG-NEXT: s_wait_alu 0xfffe @@ -380,7 +381,9 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr ; ASM-DAG-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 ; ASM-DAG-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 ; ASM-DAG-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 -; ASM-DAG-NEXT: v_mov_b32_e32 v15, v16 +; ASM-DAG-NEXT: v_dual_mov_b32 v15, v16 :: v_dual_mov_b32 v16, v17 +; ASM-DAG-NEXT: v_dual_mov_b32 v17, v18 :: v_dual_mov_b32 v18, v19 +; ASM-DAG-NEXT: v_mov_b32_e32 v19, v20 ; ASM-DAG-NEXT: s_setpc_b64 s[30:31] ; ; ASM-GISEL-LABEL: dead_non_trivial: @@ -390,38 +393,60 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr ; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 ; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 ; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 -; ASM-GISEL-NEXT: v_mov_b32_e32 v20, v0 +; ASM-GISEL-NEXT: v_mov_b32_e32 v24, v0 ; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2 ; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 ; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 -; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_and_b32 v7, 1, v20 -; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo -; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v7 +; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v19 +; ASM-GISEL-NEXT: v_dual_mov_b32 v19, v20 :: v_dual_and_b32 v20, 1, v24 +; ASM-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; ASM-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; ASM-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; ASM-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; ASM-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; ASM-GISEL-NEXT: s_cbranch_execz .LBB3_2 ; ASM-GISEL-NEXT: ; %bb.1: ; %if.then ; ASM-GISEL-NEXT: s_movk_i32 s1, 0x3e00 ; ASM-GISEL-NEXT: s_mov_b32 s2, 0 -; ASM-GISEL-NEXT: v_add_nc_u32_e32 v0, 15, v19 ; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: s_lshr_b32 s3, s0, 16 +; ASM-GISEL-NEXT: s_lshr_b32 s4, s0, 16 +; ASM-GISEL-NEXT: s_lshr_b32 s5, s0, 16 +; ASM-GISEL-NEXT: s_wait_alu 0xfffe +; ASM-GISEL-NEXT: v_dual_mov_b32 v25, s5 :: v_dual_add_nc_u32 v0, 15, v23 ; ASM-GISEL-NEXT: v_mov_b32_e32 v2, s1 ; ASM-GISEL-NEXT: v_mov_b32_e32 v6, s2 +; ASM-GISEL-NEXT: v_mov_b32_e32 v20, s3 +; ASM-GISEL-NEXT: v_mov_b32_e32 v24, s4 +; ASM-GISEL-NEXT: global_store_b32 v[21:22], v0, off +; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr1 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr3 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr4 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr5 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15_vgpr16 -; ASM-GISEL-NEXT: global_store_b32 v[17:18], v0, off -; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr17 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr18 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr7 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr19 ; ASM-GISEL-NEXT: .LBB3_2: ; %if.end ; ASM-GISEL-NEXT: s_wait_alu 0xfffe ; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; ASM-GISEL-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; ASM-GISEL-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; ASM-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; ASM-GISEL-NEXT: v_lshl_or_b32 v20, v20, 16, v17 +; ASM-GISEL-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; ASM-GISEL-NEXT: v_lshl_or_b32 v18, v25, 16, v7 ; ASM-GISEL-NEXT: v_dual_mov_b32 v7, v8 :: v_dual_mov_b32 v8, v9 ; ASM-GISEL-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 ; ASM-GISEL-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 ; ASM-GISEL-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 -; ASM-GISEL-NEXT: v_mov_b32_e32 v15, v16 +; ASM-GISEL-NEXT: v_dual_mov_b32 v15, v16 :: v_dual_mov_b32 v16, v20 ; ASM-GISEL-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %if.then, label %if.end From fbc9974357c6ee165e60527d126065e5dd418088 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Wed, 23 Apr 2025 11:10:31 +0200 Subject: [PATCH 3/4] Add test with array in struct --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll | 311 +++++++++++++++---- 1 file changed, 243 insertions(+), 68 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll index cb45758bba6d2..7fdd9680fb4b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll @@ -341,7 +341,7 @@ if.end: ret [32 x i32] %res } -%non_trivial_types = type { i8, i16, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, <5 x i32>, i128, <7 x i16>} +%non_trivial_types = type { i8, i16, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, <5 x i32>, i128, [32 x i32] } define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr addrspace(1) %ptr1, i32 %v) #0 { ; ASM-DAG-LABEL: dead_non_trivial: @@ -351,39 +351,127 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr ; ASM-DAG-NEXT: s_wait_samplecnt 0x0 ; ASM-DAG-NEXT: s_wait_bvhcnt 0x0 ; ASM-DAG-NEXT: s_wait_kmcnt 0x0 -; ASM-DAG-NEXT: v_mov_b32_e32 v24, v0 -; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1 +; ASM-DAG-NEXT: s_clause 0x15 +; ASM-DAG-NEXT: scratch_load_b32 v32, off, s32 offset:80 +; ASM-DAG-NEXT: scratch_load_b32 v31, off, s32 offset:76 +; ASM-DAG-NEXT: scratch_load_b32 v33, off, s32 offset:72 +; ASM-DAG-NEXT: scratch_load_b32 v34, off, s32 offset:68 +; ASM-DAG-NEXT: scratch_load_b32 v35, off, s32 offset:64 +; ASM-DAG-NEXT: scratch_load_b32 v36, off, s32 offset:60 +; ASM-DAG-NEXT: scratch_load_b32 v37, off, s32 offset:56 +; ASM-DAG-NEXT: scratch_load_b32 v38, off, s32 offset:52 +; ASM-DAG-NEXT: scratch_load_b32 v39, off, s32 offset:48 +; ASM-DAG-NEXT: scratch_load_b32 v48, off, s32 offset:44 +; ASM-DAG-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; ASM-DAG-NEXT: scratch_load_b32 v50, off, s32 offset:36 +; ASM-DAG-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; ASM-DAG-NEXT: scratch_load_b32 v52, off, s32 offset:28 +; ASM-DAG-NEXT: scratch_load_b32 v53, off, s32 offset:24 +; ASM-DAG-NEXT: scratch_load_b32 v54, off, s32 offset:20 +; ASM-DAG-NEXT: scratch_load_b32 v55, off, s32 offset:16 +; ASM-DAG-NEXT: scratch_load_b32 v64, off, s32 offset:12 +; ASM-DAG-NEXT: scratch_load_b32 v65, off, s32 offset:8 +; ASM-DAG-NEXT: scratch_load_b32 v66, off, s32 offset:4 +; ASM-DAG-NEXT: scratch_load_b32 v67, off, s32 +; ASM-DAG-NEXT: scratch_load_b32 v68, off, s32 offset:84 +; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v1 ; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo -; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v24 +; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1 ; ASM-DAG-NEXT: s_cbranch_execz .LBB3_2 ; ASM-DAG-NEXT: ; %bb.1: ; %if.then -; ASM-DAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_add_nc_u32 v0, 15, v23 -; ASM-DAG-NEXT: v_mov_b32_e32 v3, 0x3e00 +; ASM-DAG-NEXT: s_wait_loadcnt 0x0 +; ASM-DAG-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_add_nc_u32 v1, 15, v68 +; ASM-DAG-NEXT: v_mov_b32_e32 v4, 0x3e00 ; ASM-DAG-NEXT: ; implicit-def: $vgpr2 -; ASM-DAG-NEXT: ; implicit-def: $vgpr4 +; ASM-DAG-NEXT: ; implicit-def: $vgpr3 ; ASM-DAG-NEXT: ; implicit-def: $vgpr5 ; ASM-DAG-NEXT: ; implicit-def: $vgpr6 -; ASM-DAG-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 -; ASM-DAG-NEXT: ; implicit-def: $vgpr13_vgpr14 -; ASM-DAG-NEXT: ; implicit-def: $vgpr15_vgpr16 -; ASM-DAG-NEXT: ; implicit-def: $vgpr17_vgpr18_vgpr19_vgpr20 -; ASM-DAG-NEXT: global_store_b32 v[21:22], v0, off -; ASM-DAG-NEXT: ; implicit-def: $vgpr0 +; ASM-DAG-NEXT: ; implicit-def: $vgpr7 +; ASM-DAG-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12_vgpr13 +; ASM-DAG-NEXT: ; implicit-def: $vgpr14_vgpr15 +; ASM-DAG-NEXT: ; implicit-def: $vgpr18 +; ASM-DAG-NEXT: ; implicit-def: $vgpr19 +; ASM-DAG-NEXT: ; implicit-def: $vgpr20 +; ASM-DAG-NEXT: ; implicit-def: $vgpr21 +; ASM-DAG-NEXT: ; implicit-def: $vgpr22 +; ASM-DAG-NEXT: ; implicit-def: $vgpr23 +; ASM-DAG-NEXT: ; implicit-def: $vgpr24 +; ASM-DAG-NEXT: ; implicit-def: $vgpr25 +; ASM-DAG-NEXT: ; implicit-def: $vgpr26 +; ASM-DAG-NEXT: ; implicit-def: $vgpr27 +; ASM-DAG-NEXT: ; implicit-def: $vgpr28 +; ASM-DAG-NEXT: ; implicit-def: $vgpr29 +; ASM-DAG-NEXT: ; implicit-def: $vgpr30 +; ASM-DAG-NEXT: ; implicit-def: $vgpr67 +; ASM-DAG-NEXT: ; implicit-def: $vgpr66 +; ASM-DAG-NEXT: ; implicit-def: $vgpr65 +; ASM-DAG-NEXT: ; implicit-def: $vgpr64 +; ASM-DAG-NEXT: ; implicit-def: $vgpr55 +; ASM-DAG-NEXT: ; implicit-def: $vgpr54 +; ASM-DAG-NEXT: ; implicit-def: $vgpr53 +; ASM-DAG-NEXT: ; implicit-def: $vgpr52 +; ASM-DAG-NEXT: ; implicit-def: $vgpr51 +; ASM-DAG-NEXT: ; implicit-def: $vgpr50 +; ASM-DAG-NEXT: ; implicit-def: $vgpr49 +; ASM-DAG-NEXT: ; implicit-def: $vgpr48 +; ASM-DAG-NEXT: ; implicit-def: $vgpr39 +; ASM-DAG-NEXT: ; implicit-def: $vgpr38 +; ASM-DAG-NEXT: ; implicit-def: $vgpr37 +; ASM-DAG-NEXT: ; implicit-def: $vgpr36 +; ASM-DAG-NEXT: ; implicit-def: $vgpr35 +; ASM-DAG-NEXT: ; implicit-def: $vgpr34 +; ASM-DAG-NEXT: ; implicit-def: $vgpr33 +; ASM-DAG-NEXT: global_store_b32 v[31:32], v1, off ; ASM-DAG-NEXT: .LBB3_2: ; %if.end ; ASM-DAG-NEXT: s_wait_alu 0xfffe ; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; ASM-DAG-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 -; ASM-DAG-NEXT: v_dual_mov_b32 v3, v4 :: v_dual_mov_b32 v4, v5 -; ASM-DAG-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 -; ASM-DAG-NEXT: v_dual_mov_b32 v7, v8 :: v_dual_mov_b32 v8, v9 -; ASM-DAG-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 -; ASM-DAG-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 -; ASM-DAG-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 -; ASM-DAG-NEXT: v_dual_mov_b32 v15, v16 :: v_dual_mov_b32 v16, v17 -; ASM-DAG-NEXT: v_dual_mov_b32 v17, v18 :: v_dual_mov_b32 v18, v19 -; ASM-DAG-NEXT: v_mov_b32_e32 v19, v20 +; ASM-DAG-NEXT: s_clause 0x16 +; ASM-DAG-NEXT: scratch_store_b16 v0, v3, off offset:2 +; ASM-DAG-NEXT: scratch_store_b8 v0, v2, off +; ASM-DAG-NEXT: scratch_store_b16 v0, v4, off offset:4 +; ASM-DAG-NEXT: scratch_store_b16 v0, v5, off offset:6 +; ASM-DAG-NEXT: scratch_store_b32 v0, v6, off offset:8 +; ASM-DAG-NEXT: scratch_store_b32 v0, v7, off offset:12 +; ASM-DAG-NEXT: scratch_store_b32 v0, v8, off offset:16 +; ASM-DAG-NEXT: scratch_store_b32 v0, v13, off offset:48 +; ASM-DAG-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; ASM-DAG-NEXT: scratch_store_b128 v0, v[14:17], off offset:64 +; ASM-DAG-NEXT: scratch_store_b32 v0, v18, off offset:80 +; ASM-DAG-NEXT: scratch_store_b32 v0, v19, off offset:84 +; ASM-DAG-NEXT: scratch_store_b32 v0, v20, off offset:88 +; ASM-DAG-NEXT: scratch_store_b32 v0, v21, off offset:92 +; ASM-DAG-NEXT: scratch_store_b32 v0, v22, off offset:96 +; ASM-DAG-NEXT: scratch_store_b32 v0, v23, off offset:100 +; ASM-DAG-NEXT: scratch_store_b32 v0, v24, off offset:104 +; ASM-DAG-NEXT: scratch_store_b32 v0, v25, off offset:108 +; ASM-DAG-NEXT: scratch_store_b32 v0, v26, off offset:112 +; ASM-DAG-NEXT: scratch_store_b32 v0, v27, off offset:116 +; ASM-DAG-NEXT: scratch_store_b32 v0, v28, off offset:120 +; ASM-DAG-NEXT: scratch_store_b32 v0, v29, off offset:124 +; ASM-DAG-NEXT: scratch_store_b32 v0, v30, off offset:128 +; ASM-DAG-NEXT: s_wait_loadcnt 0x1 +; ASM-DAG-NEXT: s_clause 0x12 +; ASM-DAG-NEXT: scratch_store_b32 v0, v67, off offset:132 +; ASM-DAG-NEXT: scratch_store_b32 v0, v66, off offset:136 +; ASM-DAG-NEXT: scratch_store_b32 v0, v65, off offset:140 +; ASM-DAG-NEXT: scratch_store_b32 v0, v64, off offset:144 +; ASM-DAG-NEXT: scratch_store_b32 v0, v55, off offset:148 +; ASM-DAG-NEXT: scratch_store_b32 v0, v54, off offset:152 +; ASM-DAG-NEXT: scratch_store_b32 v0, v53, off offset:156 +; ASM-DAG-NEXT: scratch_store_b32 v0, v52, off offset:160 +; ASM-DAG-NEXT: scratch_store_b32 v0, v51, off offset:164 +; ASM-DAG-NEXT: scratch_store_b32 v0, v50, off offset:168 +; ASM-DAG-NEXT: scratch_store_b32 v0, v49, off offset:172 +; ASM-DAG-NEXT: scratch_store_b32 v0, v48, off offset:176 +; ASM-DAG-NEXT: scratch_store_b32 v0, v39, off offset:180 +; ASM-DAG-NEXT: scratch_store_b32 v0, v38, off offset:184 +; ASM-DAG-NEXT: scratch_store_b32 v0, v37, off offset:188 +; ASM-DAG-NEXT: scratch_store_b32 v0, v36, off offset:192 +; ASM-DAG-NEXT: scratch_store_b32 v0, v35, off offset:196 +; ASM-DAG-NEXT: scratch_store_b32 v0, v34, off offset:200 +; ASM-DAG-NEXT: scratch_store_b32 v0, v33, off offset:204 +; ASM-DAG-NEXT: s_wait_loadcnt 0x0 ; ASM-DAG-NEXT: s_setpc_b64 s[30:31] ; ; ASM-GISEL-LABEL: dead_non_trivial: @@ -393,60 +481,147 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr ; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 ; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 ; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 -; ASM-GISEL-NEXT: v_mov_b32_e32 v24, v0 -; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2 -; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 -; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 -; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v19 -; ASM-GISEL-NEXT: v_dual_mov_b32 v19, v20 :: v_dual_and_b32 v20, 1, v24 -; ASM-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v18 -; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; ASM-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; ASM-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; ASM-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; ASM-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; ASM-GISEL-NEXT: s_clause 0x15 +; ASM-GISEL-NEXT: scratch_load_b32 v33, off, s32 +; ASM-GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:4 +; ASM-GISEL-NEXT: scratch_load_b32 v35, off, s32 offset:8 +; ASM-GISEL-NEXT: scratch_load_b32 v36, off, s32 offset:12 +; ASM-GISEL-NEXT: scratch_load_b32 v37, off, s32 offset:16 +; ASM-GISEL-NEXT: scratch_load_b32 v38, off, s32 offset:20 +; ASM-GISEL-NEXT: scratch_load_b32 v39, off, s32 offset:24 +; ASM-GISEL-NEXT: scratch_load_b32 v48, off, s32 offset:28 +; ASM-GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:32 +; ASM-GISEL-NEXT: scratch_load_b32 v50, off, s32 offset:36 +; ASM-GISEL-NEXT: scratch_load_b32 v51, off, s32 offset:40 +; ASM-GISEL-NEXT: scratch_load_b32 v52, off, s32 offset:44 +; ASM-GISEL-NEXT: scratch_load_b32 v53, off, s32 offset:48 +; ASM-GISEL-NEXT: scratch_load_b32 v54, off, s32 offset:52 +; ASM-GISEL-NEXT: scratch_load_b32 v55, off, s32 offset:56 +; ASM-GISEL-NEXT: scratch_load_b32 v64, off, s32 offset:60 +; ASM-GISEL-NEXT: scratch_load_b32 v65, off, s32 offset:64 +; ASM-GISEL-NEXT: scratch_load_b32 v66, off, s32 offset:68 +; ASM-GISEL-NEXT: scratch_load_b32 v67, off, s32 offset:72 +; ASM-GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:76 +; ASM-GISEL-NEXT: scratch_load_b32 v32, off, s32 offset:80 +; ASM-GISEL-NEXT: scratch_load_b32 v68, off, s32 offset:84 +; ASM-GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo +; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v1 ; ASM-GISEL-NEXT: s_cbranch_execz .LBB3_2 ; ASM-GISEL-NEXT: ; %bb.1: ; %if.then -; ASM-GISEL-NEXT: s_movk_i32 s1, 0x3e00 -; ASM-GISEL-NEXT: s_mov_b32 s2, 0 -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: s_lshr_b32 s3, s0, 16 -; ASM-GISEL-NEXT: s_lshr_b32 s4, s0, 16 -; ASM-GISEL-NEXT: s_lshr_b32 s5, s0, 16 +; ASM-GISEL-NEXT: s_mov_b32 s1, 0 +; ASM-GISEL-NEXT: s_movk_i32 s2, 0x3e00 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 ; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: v_dual_mov_b32 v25, s5 :: v_dual_add_nc_u32 v0, 15, v23 -; ASM-GISEL-NEXT: v_mov_b32_e32 v2, s1 -; ASM-GISEL-NEXT: v_mov_b32_e32 v6, s2 -; ASM-GISEL-NEXT: v_mov_b32_e32 v20, s3 -; ASM-GISEL-NEXT: v_mov_b32_e32 v24, s4 -; ASM-GISEL-NEXT: global_store_b32 v[21:22], v0, off -; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr1 +; ASM-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_add_nc_u32 v1, 15, v68 +; ASM-GISEL-NEXT: v_mov_b32_e32 v8, s1 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr2 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr3 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr4 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr5 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15_vgpr16 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr17 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr18 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr6 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr7 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12_vgpr13 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr18 ; ASM-GISEL-NEXT: ; implicit-def: $vgpr19 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr20 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr21 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr22 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr23 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr24 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr25 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr26 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr27 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr28 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr29 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr30 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr33 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr34 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr35 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr36 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr37 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr38 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr39 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr48 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr49 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr50 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr51 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr52 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr53 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr54 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr55 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr64 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr65 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr66 +; ASM-GISEL-NEXT: ; implicit-def: $vgpr67 +; ASM-GISEL-NEXT: global_store_b32 v[31:32], v1, off ; ASM-GISEL-NEXT: .LBB3_2: ; %if.end ; ASM-GISEL-NEXT: s_wait_alu 0xfffe ; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; ASM-GISEL-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; ASM-GISEL-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; ASM-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; ASM-GISEL-NEXT: v_lshl_or_b32 v20, v20, 16, v17 -; ASM-GISEL-NEXT: v_lshl_or_b32 v17, v24, 16, v18 -; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) -; ASM-GISEL-NEXT: v_lshl_or_b32 v18, v25, 16, v7 -; ASM-GISEL-NEXT: v_dual_mov_b32 v7, v8 :: v_dual_mov_b32 v8, v9 -; ASM-GISEL-NEXT: v_dual_mov_b32 v9, v10 :: v_dual_mov_b32 v10, v11 -; ASM-GISEL-NEXT: v_dual_mov_b32 v11, v12 :: v_dual_mov_b32 v12, v13 -; ASM-GISEL-NEXT: v_dual_mov_b32 v13, v14 :: v_dual_mov_b32 v14, v15 -; ASM-GISEL-NEXT: v_dual_mov_b32 v15, v16 :: v_dual_mov_b32 v16, v20 +; ASM-GISEL-NEXT: s_clause 0x16 +; ASM-GISEL-NEXT: scratch_store_b8 v0, v2, off +; ASM-GISEL-NEXT: scratch_store_b16 v0, v3, off offset:2 +; ASM-GISEL-NEXT: scratch_store_b16 v0, v4, off offset:4 +; ASM-GISEL-NEXT: scratch_store_b16 v0, v5, off offset:6 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v6, off offset:8 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v7, off offset:12 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v8, off offset:16 +; ASM-GISEL-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v13, off offset:48 +; ASM-GISEL-NEXT: scratch_store_b128 v0, v[14:17], off offset:64 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v18, off offset:80 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v19, off offset:84 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v20, off offset:88 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v21, off offset:92 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v22, off offset:96 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v23, off offset:100 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v24, off offset:104 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v25, off offset:108 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v26, off offset:112 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v27, off offset:116 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v28, off offset:120 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v29, off offset:124 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v30, off offset:128 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x15 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v33, off offset:132 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x14 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v34, off offset:136 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x13 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v35, off offset:140 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x12 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v36, off offset:144 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x11 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v37, off offset:148 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x10 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v38, off offset:152 +; ASM-GISEL-NEXT: s_wait_loadcnt 0xf +; ASM-GISEL-NEXT: scratch_store_b32 v0, v39, off offset:156 +; ASM-GISEL-NEXT: s_wait_loadcnt 0xe +; ASM-GISEL-NEXT: scratch_store_b32 v0, v48, off offset:160 +; ASM-GISEL-NEXT: s_wait_loadcnt 0xd +; ASM-GISEL-NEXT: scratch_store_b32 v0, v49, off offset:164 +; ASM-GISEL-NEXT: s_wait_loadcnt 0xc +; ASM-GISEL-NEXT: scratch_store_b32 v0, v50, off offset:168 +; ASM-GISEL-NEXT: s_wait_loadcnt 0xb +; ASM-GISEL-NEXT: scratch_store_b32 v0, v51, off offset:172 +; ASM-GISEL-NEXT: s_wait_loadcnt 0xa +; ASM-GISEL-NEXT: scratch_store_b32 v0, v52, off offset:176 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x9 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v53, off offset:180 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x8 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v54, off offset:184 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x7 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v55, off offset:188 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x6 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v64, off offset:192 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x5 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v65, off offset:196 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x4 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v66, off offset:200 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x3 +; ASM-GISEL-NEXT: scratch_store_b32 v0, v67, off offset:204 +; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 ; ASM-GISEL-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %if.then, label %if.end From d791af54ab9c2153fbfbf572c1680e86411e91b4 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Mon, 5 May 2025 11:17:11 +0200 Subject: [PATCH 4/4] Use range-based for --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0c1d2d8125dbf..30badff386cc2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9122,8 +9122,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return lowerLaneOp(*this, Op.getNode(), DAG); case Intrinsic::amdgcn_dead: { SmallVector Poisons; - for (unsigned I = 0, E = Op.getNode()->getNumValues(); I != E; ++I) - Poisons.push_back(DAG.getPOISON(Op.getNode()->getValueType(I))); + for (const EVT ValTy : Op.getNode()->values()) + Poisons.push_back(DAG.getPOISON(ValTy)); return DAG.getMergeValues(Poisons, SDLoc(Op)); } default: