From b2c7a7636101dfd2c56be5d328818ae9b2df0e77 Mon Sep 17 00:00:00 2001 From: Abhinav Date: Thu, 7 Nov 2024 11:34:15 +0530 Subject: [PATCH 1/2] CodeGen using True16 D16 LDS ld/st pseudo instructions Implement new pseudos with the suffix _t16 which have VGPR_16 as the store src or load dst. This affects 8-bit and 16-bit LDS loads and stores. Lower the pseudos to the existing real instructions in the MC inst layer with VGPR_32 src or dst (which makes them consistent with the hardware encoding). This patch reduces VGPR usage by making the hi halves of VGPRs available for other values. Update the affected lit tests. --- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 41 ++ llvm/lib/Target/AMDGPU/DSInstructions.td | 87 ++- .../AMDGPU/GlobalISel/store-local.128.ll | 61 ++- .../AMDGPU/GlobalISel/store-local.96.ll | 44 +- llvm/test/CodeGen/AMDGPU/atomic_load_local.ll | 509 ++++++++++++++---- .../test/CodeGen/AMDGPU/atomic_store_local.ll | 486 ++++++++++++++--- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 92 ++-- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 28 +- 8 files changed, 1043 insertions(+), 305 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 895d1e77bf1c4..edf62d25e0f8e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -187,6 +187,47 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(Dest); OutMI.addOperand(Src); return; + } else if (const auto *Info = AMDGPU::getT16D16Helper(Opcode)) { + uint16_t OpName = AMDGPU::OpName::OPERAND_LAST; + if (TII->isDS(Opcode)) { + if (MI->mayLoad()) + OpName = llvm::AMDGPU::OpName::vdst; + else if (MI->mayStore()) + OpName = llvm::AMDGPU::OpName::data0; + else + llvm_unreachable("LDS load or store expected"); + } else { + OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata) + ? 
llvm::AMDGPU::OpName::vdata + : llvm::AMDGPU::OpName::vdst; + } + int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName); + MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx); + bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI); + Opcode = IsHi ? Info->HiOp : Info->LoOp; + MIVDstOrVData.clearParent(); // Avoid use list error in setReg call + MIVDstOrVData.setReg(TRI.get32BitRegister(MIVDstOrVData.getReg())); + + int MCOpcode = TII->pseudoToMCOpcode(Opcode); + assert(MCOpcode != -1 && + "Pseudo instruction doesn't have a target-specific version"); + OutMI.setOpcode(MCOpcode); + for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) { + const MachineOperand &MO = MI->getOperand(I); + MCOperand MCOp; + if (I == VDstOrVDataIdx) + lowerOperand(MIVDstOrVData, MCOp); + else + lowerOperand(MO, MCOp); + OutMI.addOperand(MCOp); + } + + if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) { + MCOperand MCOp; + lowerOperand(MIVDstOrVData, MCOp); + OutMI.addOperand(MCOp); + } + return; } else if (Opcode == AMDGPU::SI_TCRETURN || Opcode == AMDGPU::SI_TCRETURN_GFX) { // TODO: How to use branch immediate and avoid register+add? 
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index d3487daee364f..e1e7433b04697 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -127,6 +127,15 @@ multiclass DS_1A1D_NORET_mc { } } +multiclass DS_1A1D_NORET_t16 +: DS_1A1D_NORET_mc { + let has_m0_read = 0 in { + let True16Predicate = UseRealTrue16Insts in { + def "_t16" : DS_1A1D_NORET, True16D16Table; + } + } +} + multiclass DS_1A1D_NORET_mc_gfx9 { let has_m0_read = 0 in { def "" : DS_1A1D_NORET; @@ -294,6 +303,15 @@ multiclass DS_1A_RET_mc +: DS_1A_RET_mc { + let has_m0_read = 0 in { + let True16Predicate = UseRealTrue16Insts in { + def "_t16" : DS_1A_RET, True16D16Table; + } + } +} + multiclass DS_1A_RET_NoM0 { let has_m0_read = 0 in { def "" : DS_1A_RET; @@ -457,8 +475,6 @@ defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">; defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">; let mayLoad = 0 in { -defm DS_WRITE_B8 : DS_1A1D_NORET_mc<"ds_write_b8">; -defm DS_WRITE_B16 : DS_1A1D_NORET_mc<"ds_write_b16">; defm DS_WRITE_B32 : DS_1A1D_NORET_mc<"ds_write_b32">; defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">; defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">; @@ -473,6 +489,9 @@ def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">; } // End has_m0_read = 0 +defm DS_WRITE_B8 : DS_1A1D_NORET_t16<"ds_write_b8">; +defm DS_WRITE_B16 : DS_1A1D_NORET_t16<"ds_write_b16">; + let SubtargetPredicate = HasDSAddTid in { def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">; } @@ -625,10 +644,7 @@ def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>; } let mayStore = 0 in { -defm DS_READ_I8 : DS_1A_RET_mc<"ds_read_i8">; -defm DS_READ_U8 : DS_1A_RET_mc<"ds_read_u8">; defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">; -defm DS_READ_U16 : DS_1A_RET_mc<"ds_read_u16">; defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">; defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>; 
@@ -649,6 +665,10 @@ def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">; } } // End has_m0_read = 0 +defm DS_READ_I8 : DS_1A_RET_t16<"ds_read_i8">; +defm DS_READ_U8 : DS_1A_RET_t16<"ds_read_u8">; +defm DS_READ_U16 : DS_1A_RET_t16<"ds_read_u16">; + let SubtargetPredicate = HasDSAddTid in { def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">; } @@ -784,34 +804,51 @@ multiclass DSReadPat_mc { } } +multiclass DSReadPat_t16 { + + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSReadPat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in + let True16Predicate = p in { + def : DSReadPat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; + } + let True16Predicate = UseRealTrue16Insts in { + def : DSReadPat(!cast(inst)#"_t16"), vt, !cast(frag)>; + } + } +} + class DSReadPat_D16 : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in), (inst $ptr, Offset:$offset, (i1 0), $in) >; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; -defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; +defm : DSReadPat_t16 ; +defm : DSReadPat_t16 ; +defm : DSReadPat_t16 ; foreach vt = Reg32Types.types in { defm : DSReadPat_mc ; } -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; @@ -850,18 +887,34 @@ multiclass DSWritePat_mc { } } +multiclass DSWritePat_t16 { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSWritePat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in + let 
True16Predicate = p in { + def : DSWritePat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; + } + let True16Predicate = UseRealTrue16Insts in { + def : DSWritePat(!cast(inst)#"_t16"), vt, !cast(frag)>; + } + } +} + defm : DSWritePat_mc ; defm : DSWritePat_mc ; -defm : DSWritePat_mc ; -defm : DSWritePat_mc ; +defm : DSWritePat_t16 ; +defm : DSWritePat_t16 ; foreach vt = Reg32Types.types in { defm : DSWritePat_mc ; } -defm : DSWritePat_mc ; +defm : DSWritePat_t16 ; defm : DSWritePat_mc ; -defm : DSWritePat_mc ; +defm : DSWritePat_t16 ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index e81bae5d3a416..f6fbae88dc84a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -239,48 +239,53 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s6, 0xffff, s0 ; GFX11-NEXT: s_lshr_b32 s5, s0, 16 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-NEXT: s_lshr_b32 s0, s1, 16 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b16_e32 v0.h, s1 ; GFX11-NEXT: s_lshr_b32 s1, s2, 16 ; GFX11-NEXT: s_and_b32 s7, 0xffff, s2 +; GFX11-NEXT: v_mov_b16_e32 v1.l, s2 ; GFX11-NEXT: s_lshr_b32 s2, s6, 8 ; GFX11-NEXT: s_lshr_b32 s6, s5, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6 -; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b16_e32 v2.h, s2 +; GFX11-NEXT: v_mov_b16_e32 v1.h, s5 ; GFX11-NEXT: s_lshr_b32 s4, s4, 8 ; GFX11-NEXT: s_lshr_b32 s5, s0, 8 +; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: s_lshr_b32 s0, s7, 8 -; 
GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 -; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:1 -; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:3 -; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v9 offset:7 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_mov_b16_e32 v3.l, s6 +; GFX11-NEXT: v_mov_b16_e32 v3.h, s4 +; GFX11-NEXT: v_mov_b16_e32 v4.l, s5 +; GFX11-NEXT: ds_store_b8 v5, v0 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:1 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:2 +; GFX11-NEXT: ds_store_b8 v5, v3 offset:3 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:4 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:5 +; GFX11-NEXT: ds_store_b8 v5, v2 offset:6 +; GFX11-NEXT: ds_store_b8 v5, v4 offset:7 +; GFX11-NEXT: v_mov_b16_e32 v0.l, s0 ; GFX11-NEXT: s_lshr_b32 s0, s1, 8 -; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11-NEXT: v_mov_b16_e32 v1.h, s0 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s3 ; GFX11-NEXT: s_lshr_b32 s1, s3, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-NEXT: v_mov_b16_e32 v2.h, s0 ; GFX11-NEXT: s_lshr_b32 s0, s1, 8 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 -; GFX11-NEXT: ds_store_b8 v1, v0 offset:9 -; GFX11-NEXT: ds_store_b8 v1, v2 offset:10 -; GFX11-NEXT: ds_store_b8 v1, v4 offset:11 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:12 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:13 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:14 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:15 +; GFX11-NEXT: v_mov_b16_e32 v3.l, s1 +; GFX11-NEXT: v_mov_b16_e32 v3.h, s0 +; 
GFX11-NEXT: ds_store_b8 v5, v1 offset:8 +; GFX11-NEXT: ds_store_b8 v5, v0 offset:9 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:10 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:11 +; GFX11-NEXT: ds_store_b8 v5, v2 offset:12 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:13 +; GFX11-NEXT: ds_store_b8 v5, v3 offset:14 +; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:15 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 030f01a8bd5ea..27816a9375d30 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -207,36 +207,42 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s0 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: v_mov_b16_e32 v0.l, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v6, s3 ; GFX11-NEXT: s_lshr_b32 s0, s1, 16 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b16_e32 v0.h, s1 ; GFX11-NEXT: s_lshr_b32 s1, s2, 16 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX11-NEXT: v_mov_b16_e32 v1.l, s2 ; GFX11-NEXT: s_lshr_b32 s2, s5, 8 ; GFX11-NEXT: s_lshr_b32 s5, s4, 8 -; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-NEXT: v_mov_b16_e32 v1.h, s4 ; GFX11-NEXT: s_lshr_b32 s3, s3, 8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: s_lshr_b32 s0, s6, 8 ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3 -; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 -; 
GFX11-NEXT: v_mov_b32_e32 v12, s6 -; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 -; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:3 -; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v9 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v10 offset:7 -; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 -; GFX11-NEXT: ds_store_b8 v1, v11 offset:9 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:10 -; GFX11-NEXT: ds_store_b8 v1, v12 offset:11 +; GFX11-NEXT: v_mov_b16_e32 v3.h, s5 +; GFX11-NEXT: v_mov_b16_e32 v2.h, s1 +; GFX11-NEXT: v_mov_b16_e32 v4.l, s3 +; GFX11-NEXT: v_mov_b16_e32 v4.h, s4 +; GFX11-NEXT: v_mov_b16_e32 v5.l, s0 +; GFX11-NEXT: v_mov_b16_e32 v5.h, s6 +; GFX11-NEXT: ds_store_b8 v6, v0 +; GFX11-NEXT: ds_store_b8 v6, v3 offset:1 +; GFX11-NEXT: ds_store_b8_d16_hi v6, v1 offset:2 +; GFX11-NEXT: ds_store_b8_d16_hi v6, v3 offset:3 +; GFX11-NEXT: ds_store_b8_d16_hi v6, v0 offset:4 +; GFX11-NEXT: ds_store_b8 v6, v4 offset:5 +; GFX11-NEXT: ds_store_b8 v6, v2 offset:6 +; GFX11-NEXT: ds_store_b8_d16_hi v6, v4 offset:7 +; GFX11-NEXT: ds_store_b8 v6, v1 offset:8 +; GFX11-NEXT: ds_store_b8 v6, v5 offset:9 +; GFX11-NEXT: ds_store_b8_d16_hi v6, v2 offset:10 +; GFX11-NEXT: ds_store_b8_d16_hi v6, v5 offset:11 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll index a3b6c283512f3..7f45b038b6d0d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll @@ -1,208 +1,493 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; GCN-LABEL: {{^}}atomic_load_monotonic_i8: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u8 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u8 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1 ret i8 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i8_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u8 v0, v0 offset:16{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: 
atomic_load_monotonic_i8_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u8 v0, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i8_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v0, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0 offset:16 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 offset:16 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16 %load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1 ret i8 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 ret i16 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i16_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i16_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i16_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16 %load = load atomic i16, ptr 
addrspace(3) %gep monotonic, align 2 ret i16 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i32: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4 ret i32 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i32_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16 %load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4 ret i32 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i64: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8 ret i64 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i64_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16 %load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8 ret i64 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_f32_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_f32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16 %load = load atomic float, ptr addrspace(3) %gep monotonic, align 4 ret float %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_f64_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define double @atomic_load_monotonic_f64_offset(ptr 
addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_f64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds double, ptr addrspace(3) %ptr, i32 16 %load = load atomic double, ptr addrspace(3) %gep monotonic, align 8 ret double %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_p0i8_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define ptr @atomic_load_monotonic_p0i8_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_p0i8_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_p0i8_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_p0i8_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr 
inbounds ptr, ptr addrspace(3) %ptr, i32 16 %load = load atomic ptr, ptr addrspace(3) %gep monotonic, align 8 ret ptr %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_p3i8_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_p3i8_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_p3i8_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_p3i8_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %ptr, i32 16 %load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4 ret ptr addrspace(3) %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_f16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2 %ret = bitcast half %load to i16 ret i16 %ret } -; GCN-LABEL: {{^}}atomic_load_monotonic_f16_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f16_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f16_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_f16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_f16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
ds_load_u16 v0, v0 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16 %load = load atomic half, ptr addrspace(3) %gep monotonic, align 2 %ret = bitcast half %load to i16 ret i16 %ret } -; GCN-LABEL: {{^}}atomic_load_monotonic_bf16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_bf16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2 %ret = bitcast bfloat %load to i16 ret i16 %ret } -; GCN-LABEL: {{^}}atomic_load_monotonic_bf16_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) { +; 
CI-LABEL: atomic_load_monotonic_bf16_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_bf16_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_bf16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_bf16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16 %load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2 %ret = bitcast bfloat %load to i16 ret i16 %ret } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll index cd1e1fb1add47..9236b4018317a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll @@ -1,156 +1,470 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; GCN-LABEL: {{^}}atomic_store_monotonic_i8: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b8 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) { +; CI-LABEL: atomic_store_monotonic_i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b8 v0, v1 +; CI-NEXT: ds_write_b8 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b8 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i8 %val, 2 store atomic i8 %val, ptr addrspace(3) %ptr monotonic, align 1 + store atomic i8 %val1, ptr addrspace(3) %ptr monotonic, align 1 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i8: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b8 v0, v1 offset:16{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) { - %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16 - store atomic i8 %val, ptr addrspace(3) %gep monotonic, align 1 +; CI-LABEL: atomic_store_monotonic_offset_i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b8 v0, v1 offset:8 +; CI-NEXT: ds_write_b8 v0, v2 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b8 v0, v1 offset:8 +; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1 offset:16 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:8 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i8 %val, 2 + %gep_1 = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 8 + %gep_2 = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16 + store atomic i8 %val, ptr addrspace(3) %gep_1 monotonic, align 1 + store atomic i8 %val1, ptr addrspace(3) %gep_2 monotonic, align 1 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_i16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i16(ptr addrspace(3) %ptr, i16 %val) { +; CI-LABEL: atomic_store_monotonic_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 +; CI-NEXT: ds_write_b16 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; 
GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i16 %val, 2 store atomic i16 %val, ptr addrspace(3) %ptr monotonic, align 2 + store atomic i16 %val1, ptr addrspace(3) %ptr monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i16(ptr addrspace(3) %ptr, i16 %val) { +; CI-LABEL: atomic_store_monotonic_offset_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 offset:32 +; CI-NEXT: ds_write_b16 v0, v2 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:32 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32 +; GFX11-TRUE16-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i16 %val, 2 %gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16 store atomic i16 %val, ptr addrspace(3) %gep monotonic, align 2 + store atomic i16 %val1, ptr addrspace(3) %gep monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_i32: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b32 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i32(ptr addrspace(3) %ptr, i32 %val) { +; CI-LABEL: atomic_store_monotonic_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store atomic i32 %val, ptr addrspace(3) %ptr monotonic, align 4 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i32: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void 
@atomic_store_monotonic_offset_i32(ptr addrspace(3) %ptr, i32 %val) { +; CI-LABEL: atomic_store_monotonic_offset_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v1 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v1 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_offset_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b32 v0, v1 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16 store atomic i32 %val, ptr addrspace(3) %gep monotonic, align 4 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_i64: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i64(ptr addrspace(3) %ptr, i64 %val) { +; CI-LABEL: atomic_store_monotonic_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b64 v0, v[1:2] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b64 v0, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b64 v0, v[1:2] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store atomic i64 %val, ptr addrspace(3) %ptr monotonic, align 
8 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i64: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val) { +; CI-LABEL: atomic_store_monotonic_offset_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b64 v0, v[1:2] offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b64 v0, v[1:2] offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_offset_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b64 v0, v[1:2] offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i64 16 store atomic i64 %val, ptr addrspace(3) %gep monotonic, align 8 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_f16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 +; CI-NEXT: ds_write_b16 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: 
ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 %val = bitcast i16 %arg.val to half + %val1 = bitcast i16 %arg.val1 to half store atomic half %val, ptr addrspace(3) %ptr monotonic, align 2 + store atomic half %val1, ptr addrspace(3) %ptr monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_f16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_offset_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 offset:32 +; CI-NEXT: ds_write_b16 v0, v2 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:32 +; GFX9-NEXT: ds_write_b16 v0, v2 
offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 + %val1 = bitcast i16 %arg.val1 to half %val = bitcast i16 %arg.val to half %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16 store atomic half %val, ptr addrspace(3) %gep monotonic, align 2 + store atomic half %val1, ptr addrspace(3) %gep monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_bf16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_bf16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 +; CI-NEXT: ds_write_b16 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: 
ds_write_b16 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 + %val1 = bitcast i16 %arg.val1 to bfloat %val = bitcast i16 %arg.val to bfloat store atomic bfloat %val, ptr addrspace(3) %ptr monotonic, align 2 + store atomic bfloat %val1, ptr addrspace(3) %ptr monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_bf16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_offset_bf16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 offset:32 +; CI-NEXT: ds_write_b16 v0, v2 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:32 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:32 +; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 + %val1 = bitcast i16 %arg.val1 to bfloat %val = bitcast i16 %arg.val to bfloat %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16 store atomic bfloat %val, ptr addrspace(3) %gep monotonic, align 2 + store atomic bfloat %val1, ptr addrspace(3) %gep monotonic, align 2 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 806fe899a9149..c739ba2183ef9 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -224,15 +224,25 @@ define <2 x half> @chain_hi_to_lo_group() { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_group: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: ds_load_u16 v0, v1 offset:2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_group: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v1 offset:2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_group: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v1 offset:2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1 %load_lo = load half, ptr addrspace(3) %gep_lo @@ -263,14 +273,23 @@ define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_l ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_group_different_bases: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_u16 v0, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_different_bases: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_different_bases: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, ptr addrspace(3) %base_lo %load_hi = load half, ptr addrspace(3) %base_hi @@ -780,16 +799,27 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_u16 v1, v0 offset:2 -; GFX11-NEXT: ds_load_u16_d16_hi v0, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v1, v0 offset:2 +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v1, v0 offset:2 +; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1 %load_lo = load volatile i16, ptr addrspace(3) %gep_lo @@ -1047,12 +1077,12 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt ; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_may_alias_store: ; GFX11-TRUE16: ; %bb.0: ; %bb ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0x7b -; GFX11-TRUE16-NEXT: ds_load_u16 v3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0 ; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2 -; GFX11-TRUE16-NEXT: ds_load_u16 v0, v0 offset:2 +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_may_alias_store: diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 8f702da64c508..bd4d640efb050 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -229,10 +229,11 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 { ; ; GFX11-LABEL: 
add_x_shl_max_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-NEXT: ds_store_b8 v1, v0 offset:65535 ; GFX11-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %x.i, 4 @@ -273,11 +274,12 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-NEXT: ds_store_b8 v1, v0 ; GFX11-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %.neg = mul i32 %x.i, -4 @@ -318,11 +320,12 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-NEXT: ds_store_b8 v1, v0 ; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i @@ -361,11 +364,12 @@ define amdgpu_kernel void 
@add_x_shl_neg_to_sub_max_offset_p1() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x10000, v0 +; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-NEXT: ds_store_b8 v1, v0 ; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i From 518d4ec9ad6706494accd83cf1a3bec809cc8e20 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Sat, 15 Mar 2025 00:34:28 -0400 Subject: [PATCH 2/2] fix test --- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 41 - .../AMDGPU/GlobalISel/store-local.128.ll | 61 +- .../AMDGPU/GlobalISel/store-local.96.ll | 44 +- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 108 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 1748 +++++++++++------ 5 files changed, 1301 insertions(+), 701 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index edf62d25e0f8e..895d1e77bf1c4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -187,47 +187,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(Dest); OutMI.addOperand(Src); return; - } else if (const auto *Info = AMDGPU::getT16D16Helper(Opcode)) { - uint16_t OpName = AMDGPU::OpName::OPERAND_LAST; - if (TII->isDS(Opcode)) { - if (MI->mayLoad()) - OpName = llvm::AMDGPU::OpName::vdst; - else if (MI->mayStore()) - OpName = llvm::AMDGPU::OpName::data0; - else - llvm_unreachable("LDS load or store expected"); - } else { - OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata) - ? 
llvm::AMDGPU::OpName::vdata - : llvm::AMDGPU::OpName::vdst; - } - int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName); - MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx); - bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI); - Opcode = IsHi ? Info->HiOp : Info->LoOp; - MIVDstOrVData.clearParent(); // Avoid use list error in setReg call - MIVDstOrVData.setReg(TRI.get32BitRegister(MIVDstOrVData.getReg())); - - int MCOpcode = TII->pseudoToMCOpcode(Opcode); - assert(MCOpcode != -1 && - "Pseudo instruction doesn't have a target-specific version"); - OutMI.setOpcode(MCOpcode); - for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) { - const MachineOperand &MO = MI->getOperand(I); - MCOperand MCOp; - if (I == VDstOrVDataIdx) - lowerOperand(MIVDstOrVData, MCOp); - else - lowerOperand(MO, MCOp); - OutMI.addOperand(MCOp); - } - - if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) { - MCOperand MCOp; - lowerOperand(MIVDstOrVData, MCOp); - OutMI.addOperand(MCOp); - } - return; } else if (Opcode == AMDGPU::SI_TCRETURN || Opcode == AMDGPU::SI_TCRETURN_GFX) { // TODO: How to use branch immediate and avoid register+add? 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index f6fbae88dc84a..e81bae5d3a416 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -239,53 +239,48 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s6, 0xffff, s0 ; GFX11-NEXT: s_lshr_b32 s5, s0, 16 -; GFX11-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: s_lshr_b32 s0, s1, 16 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s1 -; GFX11-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: s_lshr_b32 s1, s2, 16 ; GFX11-NEXT: s_and_b32 s7, 0xffff, s2 -; GFX11-NEXT: v_mov_b16_e32 v1.l, s2 ; GFX11-NEXT: s_lshr_b32 s2, s6, 8 ; GFX11-NEXT: s_lshr_b32 s6, s5, 8 -; GFX11-NEXT: v_mov_b16_e32 v2.h, s2 -; GFX11-NEXT: v_mov_b16_e32 v1.h, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0 ; GFX11-NEXT: s_lshr_b32 s4, s4, 8 ; GFX11-NEXT: s_lshr_b32 s5, s0, 8 -; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: s_lshr_b32 s0, s7, 8 -; GFX11-NEXT: v_mov_b16_e32 v3.l, s6 -; GFX11-NEXT: v_mov_b16_e32 v3.h, s4 -; GFX11-NEXT: v_mov_b16_e32 v4.l, s5 -; GFX11-NEXT: ds_store_b8 v5, v0 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:1 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:2 -; GFX11-NEXT: ds_store_b8 v5, v3 offset:3 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:4 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:5 -; GFX11-NEXT: ds_store_b8 v5, v2 offset:6 -; GFX11-NEXT: ds_store_b8 v5, v4 offset:7 -; GFX11-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: ds_store_b8 v1, v0 
+; GFX11-NEXT: ds_store_b8 v1, v6 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v9 offset:7 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_lshr_b32 s0, s1, 8 -; GFX11-NEXT: v_mov_b16_e32 v0.h, s1 -; GFX11-NEXT: v_mov_b16_e32 v1.h, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s3 ; GFX11-NEXT: s_lshr_b32 s1, s3, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 8 -; GFX11-NEXT: v_mov_b16_e32 v2.l, s3 -; GFX11-NEXT: v_mov_b16_e32 v2.h, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 ; GFX11-NEXT: s_lshr_b32 s0, s1, 8 -; GFX11-NEXT: v_mov_b16_e32 v3.l, s1 -; GFX11-NEXT: v_mov_b16_e32 v3.h, s0 -; GFX11-NEXT: ds_store_b8 v5, v1 offset:8 -; GFX11-NEXT: ds_store_b8 v5, v0 offset:9 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:10 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:11 -; GFX11-NEXT: ds_store_b8 v5, v2 offset:12 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:13 -; GFX11-NEXT: ds_store_b8 v5, v3 offset:14 -; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:15 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 +; GFX11-NEXT: ds_store_b8 v1, v0 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:11 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:12 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:13 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:14 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:15 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 
27816a9375d30..030f01a8bd5ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -207,42 +207,36 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s0 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: v_mov_b16_e32 v0.l, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_lshr_b32 s0, s1, 16 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s1 -; GFX11-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: s_lshr_b32 s1, s2, 16 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s2 -; GFX11-NEXT: v_mov_b16_e32 v1.l, s2 ; GFX11-NEXT: s_lshr_b32 s2, s5, 8 ; GFX11-NEXT: s_lshr_b32 s5, s4, 8 -; GFX11-NEXT: v_mov_b16_e32 v3.l, s2 -; GFX11-NEXT: v_mov_b16_e32 v1.h, s4 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0 ; GFX11-NEXT: s_lshr_b32 s3, s3, 8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 8 -; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: s_lshr_b32 s0, s6, 8 ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: v_mov_b16_e32 v3.h, s5 -; GFX11-NEXT: v_mov_b16_e32 v2.h, s1 -; GFX11-NEXT: v_mov_b16_e32 v4.l, s3 -; GFX11-NEXT: v_mov_b16_e32 v4.h, s4 -; GFX11-NEXT: v_mov_b16_e32 v5.l, s0 -; GFX11-NEXT: v_mov_b16_e32 v5.h, s6 -; GFX11-NEXT: ds_store_b8 v6, v0 -; GFX11-NEXT: ds_store_b8 v6, v3 offset:1 -; GFX11-NEXT: ds_store_b8_d16_hi v6, v1 offset:2 -; GFX11-NEXT: ds_store_b8_d16_hi v6, v3 offset:3 -; GFX11-NEXT: ds_store_b8_d16_hi v6, v0 offset:4 -; GFX11-NEXT: ds_store_b8 v6, v4 offset:5 -; GFX11-NEXT: ds_store_b8 v6, v2 offset:6 -; GFX11-NEXT: ds_store_b8_d16_hi v6, v4 offset:7 -; GFX11-NEXT: ds_store_b8 v6, v1 offset:8 -; GFX11-NEXT: ds_store_b8 v6, v5 offset:9 -; GFX11-NEXT: ds_store_b8_d16_hi v6, v2 
offset:10 -; GFX11-NEXT: ds_store_b8_d16_hi v6, v5 offset:11 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, s6 +; GFX11-NEXT: ds_store_b8 v1, v0 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 +; GFX11-NEXT: ds_store_b8 v1, v9 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v10 offset:7 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 +; GFX11-NEXT: ds_store_b8 v1, v11 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v12 offset:11 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index bd4d640efb050..7819da8b97e55 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -227,14 +228,22 @@ define 
amdgpu_kernel void @add_x_shl_max_offset() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_max_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 -; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 -; GFX11-NEXT: ds_store_b8 v1, v0 offset:65535 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_max_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 offset:65535 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_max_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:65535 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %x.i, 4 %add = add i32 %shl, 65535 @@ -272,15 +281,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0 -; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 -; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %.neg = mul i32 %x.i, -4 %add = add i32 %.neg, 65535 @@ -318,15 +336,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff, v0 -; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 -; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -362,15 +389,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x10000, v0 -; GFX11-NEXT: v_mov_b16_e32 v0.l, 13 -; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_p1: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v1, 0x10000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_p1: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index dcb1d0e8c20a1..027576630c877 100644 --- 
a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -17,11 +17,15 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG,GFX1200-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG,GFX1200-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-TRUE16 %s 
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-FAKE16 %s ; Test for integer mad formation for patterns used in clpeak @@ -324,71 +328,137 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; 
GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: 
v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16: +; 
GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: 
s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i16 %x, 1 %add = mul i16 %conv33, %y @@ -1461,71 +1531,137 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: 
v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, 
v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: 
v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; 
GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; 
GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i16 %x, 1 %add = mul i16 %conv33, %y @@ -4315,71 +4451,137 @@ define signext i8 
@clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i8: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: 
v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_i8: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
+; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, 
v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i8 %x, 1 %add = mul i8 %conv33, %y @@ -4584,113 +4786,221 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i8: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-SDAG-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v4, v1, v3, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i8: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 -; GFX11-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_v2i8: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-SDAG-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: 
v_mad_u16 v4, v1, v3, v1 -; GFX1200-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i8: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 -; GFX1200-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 
v2, v4, v2, 1 -; GFX1200-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v1.l, v0.h, v3.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v1.h, v0.l, v2.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.h, v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v1.l, v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v0.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v4, v1, v3, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v5, v0, v2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v3, v4, v3 +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v2, v5, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v3, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v2, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, 
v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v0.l, v2.l, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v0.h, v3.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v3.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v2.h, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v3.h, v1.h, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v1.l, v2.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v1.h, v3.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v2.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v3.h, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v1, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v4, v2, 1 +; 
GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v3, v5, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v1.l, v0.h, v3.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v1.h, v0.l, v2.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v3.l +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v2.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.l, v1.l, v3.l +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.h, v1.h, v2.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v1.l, v0.h, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v0.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v4, v1, v3, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v5, v0, v2, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v3, v4, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v2, v5, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v3, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v2, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v0.l, v2.l, v0.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v0.h, v3.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v3.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v2.h, v1.l, v2.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v3.h, v1.h, v3.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v1.l, v2.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v1.h, v3.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v2.h, v0.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v3.h, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v1, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v3, v5, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i8> %x, %add = mul <2 x i8> %y18, %y @@ -7656,103 +7966,201 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 
-; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-GISEL-NEXT: s_setpc_b64 
s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16_x2: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, 
v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: 
v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; 
GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; 
GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv69 = add i16 %x, 1 %add = mul i16 %conv69, %y @@ -7915,103 +8323,201 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; 
GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16_x2: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; 
%entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; 
GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv69 = add i16 %x, 1 %add = mul i16 %conv69, %y @@ -8792,51 +9298,95 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: multi_use_mul_mad_i16_var: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: multi_use_mul_mad_i16_var: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX1200-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; 
GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v2.l, v0.l, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v2.h, v0.l, v1.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: 
s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v2.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v3.l +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v2.l, v0.l, v1.l, v2.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v2.h, v0.l, v1.l, v3.l +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; 
GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z0 @@ -8956,27 +9506,93 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: other_use_mul_mad_i16_var: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX11-NEXT: ds_store_b16 v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-LABEL: other_use_mul_mad_i16_var: -; GFX1200: ; %bb.0: ; %entry -; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-NEXT: s_wait_expcnt 0x0 -; GFX1200-NEXT: s_wait_samplecnt 0x0 -; GFX1200-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX1200-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX1200-NEXT: ds_store_b16 v3, v4 -; GFX1200-NEXT: s_wait_dscnt 0x0 -; GFX1200-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX1200-SDAG-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 
+; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX1200-GISEL-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX1200-GISEL-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z